In [1]:
### RUN EVERY TIME: FACTORS BASED ON BLOOMBERG ECONOMIC INDICES RELEASES HISTORY DATA (NO CHANGES)

In [2]:
### RUN EVERY TIME: INITIALIZATION (REMOVE PARALLELIZATION MODULES) (CELL TO REPLACE)

import pandas as pd
import numpy as np
from datetime import date, datetime
import math
from statsmodels.tsa.x13 import x13_arima_analysis
from itertools import combinations_with_replacement
from sklearn.decomposition import PCA
import os
import gc
import re
### Plotting:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
import seaborn as sns
### Profiling:
#%load_ext line_profiler

In [3]:
### RUN ONLY WHEN ARIMA X13 SA LAUNCHING 

### Warnings hiding:
import warnings
warnings.filterwarnings('ignore')
### Seasonal adjustment module paths set up (machine-specific absolute paths: point both variables to the local x13as folder before running):
%env X13PATH = C:\Users\ighar\AppData\Roaming\jupyter\x13as
%env X12PATH = C:\Users\ighar\AppData\Roaming\jupyter\x13as
#%env X13PATH = C:\Users\igharok\Desktop\Job\ARIMA
#%env X12PATH = C:\Users\igharok\Desktop\Job\ARIMA
#%env

env: X13PATH=C:\Users\ighar\AppData\Roaming\jupyter\x13as
env: X12PATH=C:\Users\ighar\AppData\Roaming\jupyter\x13as


In [4]:
### VERSION CONTROL (NO CHANGES)

from platform import python_version
### Reporting package and interpreter versions, one 'label version' pair per output line:
print(*(' '.join(tup_pair) for tup_pair in (('pandas version: ', pd.__version__),
                                            ('numpy version: ', np.__version__),
                                            ('python version: ', python_version()))), sep = '\n')

pandas version:  2.1.4
numpy version:  1.26.3
python version:  3.11.5


In [5]:
### RUN EVERY TIME: PARAMETERS & CONSTANTS (FILE NAMES OF OUTPUT HDF FILES MODIFIED)

### GENERAL CONSTANTS:
### Full slice shortcut (used in .loc[...] selections on multi-level indexes throughout the notebook):
All = slice(None)
### Business year length:
int_bus_year = 260

### EXCEL DATA EXTRACTION:
### Strings to be treated as missing values (includes Bloomberg '#N/A ...' placeholders):
list_na_excel_values = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NULL', 'NaN', 'n/a', 'nan', 'null',
                        '#N/A Requesting Data...', '#N/A Invalid Security', '#N/A Field Not Applicable']
### Raw data path and sheets:
str_path_bb_idx_source = 'Data_Files/Source_Files/Bloomberg_Eco_Indices.xlsx'
str_all_sheet = 'All Eco Const'
### Flags data path and sheets:
str_path_bb_idx_flags = 'Data_Files/Source_Files/Bloomberg_Eco_Flags_Extended.xlsx'
str_flag_sheet = 'Bloomberg Description'
### Source data constants:
### NOTE(review): not referenced in the visible cells - presumably the number of columns per index in the source sheet; confirm
int_idx_cols = 12
### TRANSFORMED BLOOMBERG DATA KEEPING:
### HDF file with converted source data:
str_path_bb_idx_hdf = 'Data_Files/Source_Files/2024/Bloomberg_Preparation_2024.h5' ### LINE TO MODIFY
str_key_flags = 'flags_exported' ### Acadian flags list
str_key_exported = 'all_idx_exported' ### Raw export with only replacing zero dates and after 2021-01-01 dates with np.NaN
str_key_raw_filled = 'all_idx_raw_filled' ### Raw export with initial dates, dates gaps, absent date columns filled
str_key_raw_history = 'raw_history' ### Export with all the corrections and fillings (restructured to [Index_Name -> Data_Date -> Observation_Date] | Value series)
str_key_bday_history = 'bday_history' ### Raw history vector with observation dates moved to nearest future business dates
str_key_types_info = 'types_info' ### Dataframe with 'Type_Prime' / 'Sub_Type' / 'Region' groups descriptions
str_key_flags_typed = 'flags_typed' ### Dataframe with economic indices descriptions (NOTE(review): the original comment was cut off after 'taking into account' - presumably the types info; confirm)
str_key_survey_history = 'survey_history' ### Release values are replaced with Survey Medians to normalize it ### LINE TO ADD
str_key_norm_filled = 'all_idx_norm_filled' ### Normalized values with initial dates, dates gaps, absent date columns filled ### LINE TO ADD

### DATA TRANSFORMATION:
### Date range:
datetime_start = datetime(1984, 12, 31) # Start date for efficacy measures
date_start = datetime_start.date()
datetime_end = datetime(2020, 8, 31) # End date for efficacy measures
date_end = datetime_end.date()
### (End year + 1) * 10000 = 20210000 (presumably an upper bound for dates encoded as YYYYMMDD numbers - TODO confirm):
num_date_control = (datetime_end.year + 1) * 10000 ### LINE TO ADD
### Business-daily index covering the whole date range:
idx_date_range = pd.date_range(date_start, date_end, freq = 'B')
datetime_basis = datetime(1993, 12, 31) # Basis date (NOTE(review): the original comment repeated 'End date for efficacy measures' - confirm the intended meaning)
date_basis = datetime_basis.date()
### Gaps filling options:
int_revision_shift = 1
int_final_shift = 2
int_first_mean_length = 12
### Lags by release frequency, integer halves of 90 / 30 / 7 (i.e. 45 / 15 / 3):
dict_final_only_lag = {}
dict_final_only_lag['Quarterly'] = 90 // 2
dict_final_only_lag['Monthly'] = 30 // 2
dict_final_only_lag['Other'] = 7 // 2

### TRANSFORMATION TO MOM & MATRIX Z-SCORING:
### Group tickers rebasing options:
int_not_to_rebase_term = 7 ### Term in years for min ticker data date when we do not need to rebase it with basis group ticker
int_not_to_rebase_diff = 2 ### Minimal difference in years between basis ticker and other group ticker min date when we need to rebase group ticker
### CPU count to use during multiprocessing:
### NOTE(review): the same name is assigned again with the same value in the matrices aggregation section of this cell
int_cpu_count = 4
### Cumprod shifts for monthly data frequency:
### (keys are matched against the 'Processing' flag of a ticker; values are steps in months)
dict_cumprod_step = {}
dict_cumprod_step['MoM%'] = 1
dict_cumprod_step['QoQ%'] = 3
dict_cumprod_step['YoY%'] = 12
### Stock-like series shifts for MoM transformation:
### (keys are matched against the 'Frequency' flag of a ticker; 'Other' is the fallback for frequencies absent from the dictionary)
dict_mom_shift = {}
dict_mom_shift['Monthly'] = 1
dict_mom_shift['Other'] = 4
### Z-scoring options:
### Absolute z-score bound for clipping, tolerance for comparisons and maximal number of winsorization passes:
int_winsorize_bound = 4
flo_winsorize_tolerance = 0.0001
int_winsorize_steps_limit = 5
### Diagonal options:
int_min_years_z_score = 3
int_max_years_z_score = 10
date_diag_start = datetime(1994, 1, 1)
### HDF file with matrices:
str_path_bb_matrix_mom_hdf = 'Data_Files/Source_Files/2024/Matrix_Eco_Indices_mom_2024.h5' ### LINE TO MODIFY
str_key_matrix_z = 'matrix_cube_z_scored'

### MATRICES AGGREGATION:
### Data filling limit
int_fill_limit = 20
### Average region correlation matrix weight for daily correlation matrix shrinking:
flo_reg_weight = 0.5
### Regions weights (the four values sum up to 1.00):
dict_region_weight = {}
dict_region_weight['US'] = 0.50
dict_region_weight['Europe'] = 0.25
dict_region_weight['Japan'] = 0.15
dict_region_weight['UK'] = 0.10
### HDF file with group averages:
str_path_group_matrix_mom_hdf = 'Data_Files/Source_Files/2024/Matrix_Groups_mom_2024.h5' ### LINE TO MODIFY
str_key_group_matrix = 'matrix_cube_groups'
### HDF file with overall event dates as series index:
str_path_overall_dates_hdf = 'Data_Files/Source_Files/2024/Overall_Dates_2024.h5' ### LINE TO MODIFY
str_key_event_dates = 'overall_event_dates'
str_key_obs_dates = 'overall_obs_dates'
str_key_triangle_dates = 'overall_triangle_dates'
### HDF file with sub type averages:
str_path_sub_matrix_mom_hdf = 'Data_Files/Source_Files/2024/Matrix_Sub_mom_2024.h5' ### LINE TO MODIFY
str_key_sub_matrix = 'matrix_cube_subs'
### Global indices files:
str_path_ind_matrix_inf_mom_hdf = 'Data_Files/Source_Files/2024/Matrix_Global_mom_2024.h5' ### LINE TO MODIFY
str_key_global_matrix = 'matrix_cube_globals'
### Global indices compositions:
### (the parentheses around 'INF_mom' do not create a tuple: the dictionary key is the plain string 'INF_mom')
dict_global_index_hdf = {}
dict_global_index_hdf[('INF_mom')] = str_path_ind_matrix_inf_mom_hdf
### Global indices names:
dict_global_index_name = {}
dict_global_index_name[('INF_mom')] = 'Inflation Index (MoM)'
### Rolling correlation tail length:
int_corr_tail = 5
### A-la Newey-West adjustment maximum lag:
int_n_w_lag = 4
### Covariance subsamples number:
int_cov_samples = 22
### Minimal years to use column for PCA performing:
int_min_years_pca = 7
### CPU count to use during multiprocessing:
### NOTE(review): repeats the assignment made in the z-scoring section of this cell (same value)
int_cpu_count = 4
### HDF file with weights collection:
str_path_bb_weights_hdf = 'Data_Files/Source_Files/2024/FPC_Weights_2024_mom.h5' ### LINE TO MODIFY
### HDF file with correlation matrices collection:
str_path_bb_corrs_hdf = 'Data_Files/Source_Files/2024/FPC_Correlations_2024_mom.h5' ### LINE TO MODIFY
### HDF file with percentiles:
str_path_bb_percentiles_hdf = 'Data_Files/Source_Files/2024/Main_Percentiles_2024_mom.h5' ### LINE TO MODIFY
### Percentiles calculation options (window lengths in months: 120 / 60 / 12):
int_ptile_months = 10 * 12
int_ave_months = 5 * 12
int_halflife_months = 1 * 12

### DIAGONALS COLLECTION:
### HDF file with diagonals:
str_path_bb_diag_hdf = 'Data_Files/Source_Files/2024/Matrix_Diagonals_2024_mom.h5' ### LINE TO MODIFY
### Keys of the diagonals inside the HDF file:
str_key_diag_daily_mom = 'matrix_diagonal_mom'
str_key_diag_group_mom = 'groups_diagonal_mom'
str_key_diag_sub_mom = 'sub_types_diagonal_mom'
str_key_diag_agg_z_lim = 'aggregated_diagonal_z_limited'
### Global indices diagonals keys:
### (the dictionary key is the plain string 'INF_mom': the parentheses do not create a tuple)
dict_global_index_diag_key = {}
dict_global_index_diag_key[('INF_mom')] = 'global_diagonal_inf_mom'

In [6]:
### DEFINING OBSERVATION DATE VECTOR EXTRACTION (NO CHANGES)

def get_obs_date_vector(str_ticker, str_date, bool_exact_date = False, bool_drop_levels = False, str_path_hdf = None):
    """Load the z-scored vector of one ticker for a single observation date from the matrix HDF file.

    Parameters:
        str_ticker: ticker name matched against the 'Index_Name' level of the stored series.
        str_date: observation date (string) matched against the 'Observation_Date' level.
        bool_exact_date: if True, only rows with Observation_Date equal to str_date are loaded;
            otherwise all rows with Observation_Date <= str_date are loaded and only the latest
            of these observation dates is kept.
        bool_drop_levels: if True, the 'Index_Name' and 'Observation_Date' levels are dropped from the result.
        str_path_hdf: path to the HDF file to read from. When omitted, the module-level
            str_path_bb_matrix_hdf is used (legacy behaviour), so that variable has to be
            assigned before such a call.

    Returns:
        Series read under the key str_key_matrix_z (three-level index is assumed by the selection below).
    """
    ### Falling back to the module-level path to stay compatible with existing calls:
    if str_path_hdf is None:
        str_path_hdf = str_path_bb_matrix_hdf ### LINE TO MODIFY
    ### Vector for exact date:
    if bool_exact_date:
        ### The names inside the where-expression are resolved from the local variables of this function:
        ser_obs_date = pd.read_hdf(str_path_hdf, key = str_key_matrix_z, where = '(Index_Name == str_ticker) & (Observation_Date == str_date)')
    ### Vector for nearest date:
    else:
        ### Loading full ticker series:
        ser_z_scored = pd.read_hdf(str_path_hdf, key = str_key_matrix_z, where = '(Index_Name == str_ticker) & (Observation_Date <= str_date)')
        ### Extracting data for max date less or equal to needed date
        ### (taken from the actual index values, since index levels may keep unused entries):
        date_last_obs = ser_z_scored.index.get_level_values(-1).max()
        ser_obs_date = ser_z_scored.loc[All, All, [date_last_obs]]
    ### Dropping constant index levels if needed:
    if bool_drop_levels:
        return ser_obs_date.droplevel(['Index_Name', 'Observation_Date'])
    else:
        return ser_obs_date

In [7]:
### DEFINING WEIGHTED AVERAGE FOR DATAFRAME COLUMNS (NO CHANGES)

def columns_average(df_series, list_weights = False):
    """Calculate the row-wise weighted average of dataframe columns, ignoring missing values.

    Parameters:
        df_series: dataframe whose columns are to be averaged.
        list_weights: positional weights, one per column (only the first len(df_series.columns)
            elements are read). Any boolean value (the default False) means equal weights.

    Returns:
        Series indexed like df_series. For each row the weights of missing values are zeroed,
        so the average is taken over the present values only; a row without any present value
        (or with zero total weight) gives NaN. A single-column dataframe is returned as that
        column, a dataframe without columns gives an all-NaN series.
    """
    int_columns = len(df_series.columns)
    ### Several columns: weighted average with weights zeroed for NaN values:
    if (int_columns > 1):
        ### Equal weights creating or given weights extracting by position:
        if isinstance(list_weights, bool):
            arr_weights = np.ones(int_columns)
        else:
            arr_weights = np.array([list_weights[iter_num] for iter_num in range(int_columns)], dtype = float)
        ### Dataframe of weights: column weight where the value is present, zero otherwise:
        df_weights = df_series.notna() * arr_weights
        ### Sum skips NaN products, zero weights sum leads to NaN result:
        ser_mean = df_series.multiply(df_weights).sum(axis = 1).div(df_weights.sum(axis = 1))
    ### Single column: returning the column itself (always a series, even for a single row):
    elif (int_columns == 1):
        ser_mean = df_series.iloc[:, 0]
    ### No columns: nothing to average:
    else:
        ser_mean = pd.Series(np.nan, index = df_series.index)
    ### Results output:
    return ser_mean

In [8]:
### DEFINING EXPONENTIAL WEIGHT (NO CHANGES)

def exp_weight_single(halflife_len = 3, num_element = 0):
    """Return the exponential decay weight of the element with number num_element.

    The half-life is rounded to an integer number of periods first; the weight equals 1
    for num_element = 0 and halves every round(halflife_len) elements.
    """
    ### One-period decay factor for the rounded half-life:
    flo_step_factor = math.exp(math.log(0.5) / round(halflife_len))
    ### Factor raised to the power of the element number (through logarithm and exponent):
    return np.exp(num_element * math.log(flo_step_factor))

In [9]:
### DEFINING WEIGHTED AVERAGE (NO CHANGES)

def weighted_average(ser_data, ser_weight = False, int_min_count = 0):
    """Calculate the (weighted) average of the non-missing values of a series.

    Parameters:
        ser_data: series of values; missing values are ignored.
        ser_weight: series of weights addressed by the index labels of ser_data.
            None or any boolean value (the default False) means a simple average.
        int_min_count: the result is calculated only when the number of non-missing
            values is strictly greater than this threshold.

    Returns:
        The average as a float, or NaN when there are not enough values or the weights
        of the non-missing values sum up to zero.
    """
    ### Default output:
    num_result = np.nan
    ### Checking for data presence:
    if (ser_data.count() > int_min_count):
        ### Checking for weights dataset:
        if (ser_weight is None) or isinstance(ser_weight, bool):
            ### Calculating of simple average:
            num_result = np.nanmean(ser_data.values)
        else:
            ### Data filtering (performed once, reused for both weights and values):
            ser_valid = ser_data.dropna()
            ### Weights filtering:
            list_weight = ser_weight[ser_valid.index].values
            ### Checking for weights presence:
            flo_weight_sum = np.nansum(list_weight)
            if flo_weight_sum:
                ### Weighted average calculating (values with missing weights are skipped by nansum):
                num_result = np.nansum(ser_valid.values * list_weight) / flo_weight_sum
    ### Results output:
    return num_result

In [None]:
### RUN TO RE-EXPORT DATA: HISTORY DATA TRANSFORMATION : MOM ONLY (CELL TO REPLACE)

### Defining Economic Index series transformation:
def complex_transform(ser_name, idx_date_range, df_flags, int_max_name_length, int_min_years, str_path_bb_matrix_hdf, bool_perform_sa = False):
    """Transform the release history of one ticker to z-scored MoM vectors and append them to an HDF file.

    Steps performed for a ticker with 'Type_Prime' flag equal to 'INF': reindexation to the common
    observation dates, forward filling, triangle extraction, conversion to stock-like series,
    optional X13 seasonal adjustment, conversion to MoM series, z-scoring (with winsorization and
    optional rebasing) for each observation date and appending the result to the HDF file.
    Tickers with 'Type_Prime' equal to 'TAR' only produce a message; tickers of any other primary
    type are left after the triangle extraction without any output.

    Parameters:
        ser_name: series of a single ticker with 'Index_Name', 'Data_Date' and 'Observation_Date' index levels
            (the ticker name is read from the first index level).
        idx_date_range: dates index; only observation dates present in it are kept.
        df_flags: dataframe of flags addressed by ticker name; the fields read are 'Type_Prime', 'Processing',
            'Frequency', 'SA_Status', 'Base', 'Negative', 'Data_Source', 'Basic_Ticker' and 'Rebase_Flag'.
        int_max_name_length: minimal item size of the 'Index_Name' column in the HDF table.
        int_min_years: minimal history length in years (multiplied by 365 days) to perform seasonal adjustment
            and z-scoring; decreased by one for tickers with 'Data_Source' equal to 'Markit'.
        str_path_bb_matrix_hdf: path to the HDF file the results are appended to (under the key str_key_matrix_z);
            the vectors of basis tickers for rebasing are read from the same file.
        bool_perform_sa: if True, seasonal adjustment is performed for tickers whose 'SA_Status' is not 'SA'.

    Returns:
        None (there is no return statement; the result is written to the HDF file).

    Module-level names read: All, dict_cumprod_step, dict_mom_shift, int_winsorize_bound,
    flo_winsorize_tolerance, int_winsorize_steps_limit, str_key_matrix_z.
    """
    ### Defining triangle extraction:
    def triangle_filter(ser_date):
        """Keep only the values of a single Data_Date group with index dates not earlier than that Data_Date."""
        ### Extracting particular Data Date:
        date_diag = ser_date.index.get_level_values('Data_Date')[0]
        ### Dropping constant level:
        ser_result = ser_date.droplevel('Data_Date')
        ### Filtering over-diagonal values:
        ser_result = ser_result[ser_result.index >= date_diag] 
        ### Results output:
        return ser_result
    ### Period-over-period-percent ticker values transforming to stock-like series:
    def pop_to_level(ser_date, int_step):
        """Turn the multipliers of a single Observation_Date group to cumulative products taken with the step int_step.

        For each of the first min(int_step, length) positions the elements starting from that position
        and following with the step int_step are replaced with their cumulative product multiplied by
        the current basement value.
        """
        ### Dropping constant level:
        ser_result = ser_date.droplevel('Observation_Date')
        ### Basis initiating:
        flo_basement = 1.0
        ### Factor initiating: 
        flo_next_brick  = 1.0
        ### Looping over month numbers:
        for iter_period in range(min(int_step, len(ser_result.index))):         
            ### Basement building up:
            flo_basement = flo_basement * flo_next_brick
            ### Next basement brick producing:
            ### (mean-like blend, in the power form, of the previous brick and the int_step-th root of the current element)
            flo_next_brick = ((flo_next_brick ** (iter_period)) * (ser_result.iloc[iter_period] ** (1 / int_step))) ** (1 / (iter_period + 1)) 
            ### Jumping cumulative product performing:
            idx_iter_data = ser_result.index[iter_period :: int_step]
            ser_result.loc[idx_iter_data] = ser_result.loc[idx_iter_data].cumprod() * flo_basement       
        ### Results output:            
        return ser_result    
    ### X13 ARIMA Seasonality adjustment model:
    def perform_x13_sa(ser_date):
        """Seasonally adjust the vector of a single Observation_Date group with x13_arima_analysis.

        The vector is returned without adjustment (only with the 'Observation_Date' level dropped) when it is
        empty, when its history is shorter than int_min_years * 365 days or when the adjustment raises an error
        (in the last case a message is printed).
        Reads int_min_years and str_index_name from the enclosing function.
        """
        ### Dropping constant level:        
        ser_result = ser_date.droplevel('Observation_Date')
        ### Check for not empty vector:
        if (ser_result.count() > 0):
            ### Check for minimal quantity of observations to perform seasonality adjustment:
            if (ser_result.last_valid_index() - ser_result.first_valid_index()).days >= (int_min_years * 365):   
                ### Naming series for x13 performing:
                ser_result.name = 'Ticker'
                ### Calculating shift value to make all series positive:
                ### NOTE(review): for a zero minimum the shift is zero too, so the shifted series still contains a zero value
                flo_positron = abs(ser_result.min()) * 2
                try:
                    ### Performing seasonality adjustment:
                    ser_result = x13_arima_analysis(ser_result + flo_positron, outlier = True, trading = True).seasadj - flo_positron
#                    print('SA success : ', str_index_name, ' : ', ser_date.index.get_level_values('Observation_Date')[0].date())             
                except Exception as error:
                    ### Best effort: the error is only reported and the non-adjusted vector is kept
                    print('SA error : ', str_index_name, ' : ', ser_date.index.get_level_values('Observation_Date')[0].date(), ' : ', type(error).__name__)
                    pass
        ### Results output:                
        return ser_result 
#        return pd.concat([ser_result], keys = [ser_date.index.get_level_values('Observation_Date')[0]], names = ['Observation_Date'])    
    ### Extracting Observation Date column for ticker:
    ### NOTE(review): this helper has the same name as the module-level function defined earlier in the notebook, but
    ### a different signature (the file path is a parameter here and bool_drop_levels is True by default)
    def get_obs_date_vector(str_ticker, str_path_bb_matrix_hdf, str_date, bool_exact_date = False, bool_drop_levels = True):
        """Load the z-scored vector of a ticker for the exact or for the latest not later observation date from the HDF file."""
        ### Vector for exact date:
        if bool_exact_date:
            ser_obs_date = pd.read_hdf(str_path_bb_matrix_hdf, key = str_key_matrix_z, where = 'Index_Name == str_ticker & Observation_Date == str_date')
        ### Vector for nearest date:        
        else:
            ### Loading full ticker series:        
            ser_z_scored = pd.read_hdf(str_path_bb_matrix_hdf, key = str_key_matrix_z, where = 'Index_Name == str_ticker & Observation_Date <= str_date')
            ### Extracting data for max date less or equal to needed date:
            ser_obs_date = ser_z_scored.loc[All, All, [ser_z_scored.index.levels[-1].max()]]
        ### Dropping constant index levels if needed:
        if bool_drop_levels:
            return ser_obs_date.droplevel(['Index_Name', 'Observation_Date'])
        else:
            return ser_obs_date    
    ### Defining time-vector z-scoring procedure:    
    def by_date_z_score(ser_date, int_winsorize_bound, flo_tolerance, int_winsorize_steps_limit, int_min_years_adj, 
                        str_path_bb_matrix_hdf, str_basis_index, bool_rebase_flag, list_continue_rebase):
        """Z-score the vector of a single Observation_Date group with iterative winsorization and optional rebasing.

        A vector with history shorter than int_min_years_adj * 365 days is filled with NaN. A vector with standard
        deviation not greater than flo_tolerance is only demeaned. Otherwise the vector is z-scored, then clipped
        to +/- int_winsorize_bound and z-scored again until no value exceeds the bound (plus tolerance) or the
        number of passes reaches int_winsorize_steps_limit. When rebasing is on, the result is multiplied by the
        standard deviation and shifted by the mean of the basis ticker vector taken from the first valid date.
        list_continue_rebase is a one-element list used as a mutable flag shared between the calls: it is set to
        [False] as soon as the basis part statistics are within the tolerance of (std = 1, mean = 0).
        The result is converted to float32.
        Index elements are assumed to be (Data_Date, Observation_Date) pairs - TODO confirm.
        """
        ### Check for empty vector (doing nothing):
        if ser_date.count():
            ### Check for non-constant vector:
            if (ser_date.std() > flo_tolerance):
                ### Check for minimal quantity of observations to z-score:
                if (ser_date.last_valid_index()[0] - ser_date.first_valid_index()[0]).days >= (int_min_years_adj * 365):   
                    ### Calculating of z scores:
                    ser_date = (ser_date - ser_date.mean()) / ser_date.std()        
                    bool_to_winsor = True   
                    int_iter = 1
                    while (bool_to_winsor): 
                        int_iter += 1                
                        ### Value based winsorization:                
                        ser_date.clip(lower = -int_winsorize_bound, upper = int_winsorize_bound, inplace = True)
                        ### Recalculating of z scores:
                        ser_date = (ser_date - ser_date.mean()) / ser_date.std()
                        ### Checking for boundaries and steps:
                        if((ser_date.loc[ser_date.abs() >= (int_winsorize_bound + flo_tolerance)].count() == 0) | (int_iter > int_winsorize_steps_limit)):
                            bool_to_winsor = False
                    ### Checking if rebasing needed:
                    ### NOTE(review): bool() of a NaN value is True, so a missing 'Basic_Ticker' stored as NaN would pass this check - confirm flags encoding
                    if (bool(str_basis_index) & bool_rebase_flag & list_continue_rebase[0]):
                        ### Extracting column from z-scored basis ticker series:
                        str_obs_date = ser_date.index[0][1].strftime('%Y-%m-%d')
                        ser_basis_date = get_obs_date_vector(str_basis_index, str_path_bb_matrix_hdf, str_obs_date, bool_exact_date = False, bool_drop_levels = True)
                        ### Selecting only intersected time interval:
                        ser_basis_part = ser_basis_date.loc[ser_date.first_valid_index()[0]: ]
                        ### Rebasing ticker:
                        ser_date = ser_date * ser_basis_part.std() + ser_basis_part.mean()
                        ### Checking if future rebasing needed:
                        if ((abs(ser_basis_part.std() - 1) < flo_tolerance) & (abs(ser_basis_part.mean()) < flo_tolerance)):
                            list_continue_rebase[0] = False
                else:
                    ### Killing values that we can't z-score
                    ser_date.loc[All] = np.NaN
            else:
                ### Check for minimal quantity of observations to z-score:
                if (ser_date.last_valid_index()[0] - ser_date.first_valid_index()[0]).days >= (int_min_years_adj * 365):             
                    ### Constant values demeaning:
                    ser_date = ser_date - ser_date.mean()
                    ### Checking if rebasing needed:
                    ### (the rebasing steps below repeat the ones of the non-constant branch)
                    if (bool(str_basis_index) & bool_rebase_flag & list_continue_rebase[0]):
                        ### Extracting column from z-scored basis ticker series:
                        str_obs_date = ser_date.index[0][1].strftime('%Y-%m-%d')                    
                        ser_basis_date = get_obs_date_vector(str_basis_index, str_path_bb_matrix_hdf, str_obs_date, bool_exact_date = False, bool_drop_levels = True)
                        ### Selecting only intersected time interval:
                        ser_basis_part = ser_basis_date.loc[ser_date.first_valid_index()[0]: ]
                        ### Rebasing ticker:
                        ser_date = ser_date * ser_basis_part.std() + ser_basis_part.mean()
                        ### Checking if future rebasing needed:
                        if ((abs(ser_basis_part.std() - 1) < flo_tolerance) & (abs(ser_basis_part.mean()) < flo_tolerance)):
                            list_continue_rebase[0] = False
                else:
                    ### Killing values that we can't z-score
                    ser_date.loc[All] = np.NaN
        ### Memory optimization:
        ser_date = ser_date.astype('float32')
        return ser_date    
    ### EI name extracting:
    str_index_name = ser_name.index.get_level_values(0)[0]
    ### Observation dates reindexation:    
    print(ser_name.index.get_level_values(0)[0], ': Reindexation')    
    ### Only observation dates of the ticker that are present in the common date range are kept:
    idx_observation_range = ser_name.index.get_level_values('Observation_Date').unique().intersection(idx_date_range).sort_values()
    ### Building the full (Observation_Date x Data_Date) grid, empty cells included:
    ser_full = ser_name.droplevel('Index_Name').unstack('Data_Date').reindex(idx_observation_range).stack('Data_Date', dropna = False).squeeze()
    ser_full = ser_full.swaplevel()
    ser_full.index.rename('Observation_Date', level = -1, inplace = True)    
    ### Forward filling for each data date:
    ser_full = ser_full.groupby('Data_Date').ffill()   
    ### Diagonalization:
    ser_triangle = ser_full.groupby('Data_Date').apply(triangle_filter).sort_index()
    ### Flags extracting:
    ser_flags = df_flags.loc[str_index_name, All].squeeze() 
    ### 'TAR' type checking:
    if (ser_flags['Type_Prime'] == 'TAR'):
        print(str_index_name, ': TAR Primary Type ignoring')        
        pass
    ### Flags-based transforming:
    ### (only the 'INF' primary type is processed: tickers of the other types are skipped without any message)
#    else:
    elif (ser_flags['Type_Prime'] == 'INF'):        
        ### Indices of NA values collecting:
        idx_isna = ser_triangle.loc[ser_triangle.isna()].index
        ### Transforming to stock-like series:
        if (ser_flags['Processing'] in ['Index', 'Level', 'Level%']):
            ser_stock = ser_triangle
        elif (ser_flags['Processing'] == 'Flow'):
            print(str_index_name, ': Transformation to stock-like series: Cumulative sum')
            ### Filling empty values:
            ser_triangle = ser_triangle.fillna(0)
            ### Cumulative sum for each observation date calculating:
            ser_stock = ser_triangle.groupby('Observation_Date').cumsum()
            ### Dropping NA values:
            ser_stock.loc[idx_isna] = np.NaN
        else:
            print(str_index_name, ': Transformation to stock-like series: Cumulative product')
            ### Filling empty values:
            ser_triangle = ser_triangle.fillna(0)
            ### Percents to multipliers converting:
            ser_stock = 1 + ser_triangle / 100
            ### Calculating with needed periodicity:
            ### NOTE(review): for non-monthly tickers the multipliers are left as they are (no cumulative product is taken) - confirm this is intended
            if (ser_flags['Frequency'] == 'Monthly'):
                ### 'Processing' values absent from dict_cumprod_step raise KeyError here:
                int_step = dict_cumprod_step[ser_flags['Processing']]
                ### Period-by-period cumprod with rebasing:
                ser_stock = ser_stock.groupby('Observation_Date').apply(pop_to_level, int_step).swaplevel().sort_index()
            ### Dropping NA values:
            ser_stock.loc[idx_isna] = np.NaN
        ### Seasonality adjustment:
        if (bool_perform_sa & (ser_flags['SA_Status'].strip(' ') != 'SA')):
            print(str_index_name, ': Seasonality adjustment')            
            ### Filling empty values:            
            ser_stock = ser_stock.groupby('Observation_Date').ffill()
            ser_stock = ser_stock.groupby('Observation_Date').apply(perform_x13_sa).swaplevel().sort_index()
#            ser_stock = transformParallel(ser_stock.groupby('Observation_Date'), perform_x13_sa).swaplevel().sort_index()       
            ### Dropping NA values:
            ser_stock.loc[idx_isna] = np.NaN    
        ### Transforming to PoP series:
        if (ser_flags['Processing'] == 'Index'):
            ### Debasing only:
            print(str_index_name, ': Transformation to MoM series: Debasing')            
            ser_mom = ser_stock - ser_flags['Base']           
        elif (ser_flags['Processing'] in ['Flow', 'Level']):    
            ### Simple difference:
            print(str_index_name, ': Transformation to MoM series: Simple difference')
            ### Shifting lag defining:
            if (ser_flags['Frequency'] in dict_mom_shift.keys()):
                int_mom_shift = dict_mom_shift[ser_flags['Frequency']]
            else:
                int_mom_shift = dict_mom_shift['Other']            
            ### Stock-like series differing:
            ser_mom = ser_stock.groupby('Observation_Date', group_keys = False).apply(lambda ser_obs_date: ser_obs_date - ser_obs_date.shift(int_mom_shift))
        else:      
            ### Difference with dividing:
            print(str_index_name, ': Transformation to MoM series: Difference with dividing')
            ### Shifting lag defining:
            if (ser_flags['Frequency'] in dict_mom_shift.keys()):
                int_mom_shift = dict_mom_shift[ser_flags['Frequency']]
            else:
                int_mom_shift = dict_mom_shift['Other']
            ### Stock-like series differing:
            ser_mom = ser_stock.groupby('Observation_Date', group_keys = False)\
                               .apply(lambda ser_obs_date: (ser_obs_date / ser_obs_date.shift(int_mom_shift) - 1))  
        ser_mom.name = 'MoM'
        ### Negative flag check:
        if (ser_flags['Negative'] == 1):
            ser_mom = -ser_mom
        ### Z-scoring across the observation dates:
        print(ser_name.index.get_level_values(0)[0], ': Z-scoring across the observation dates')
        ### To stop rebasing when basic ticker (std, mean) are close to (1, 0):
        list_continue_rebase = [True] 
        ### Adjusting Z-score period limit for some groups:
        if (ser_flags['Data_Source'] == 'Markit'):
            int_min_years_adj = int_min_years - 1
        else:
            int_min_years_adj = int_min_years    
        ### Z-score tranformation:
        ### (winsorization options are taken from the module-level constants, not from the function parameters)
        ser_mom_z = ser_mom.groupby('Observation_Date')\
                           .transform(by_date_z_score, int_winsorize_bound, flo_winsorize_tolerance, int_winsorize_steps_limit, int_min_years_adj,
                                      str_path_bb_matrix_hdf, ser_flags['Basic_Ticker'], ser_flags['Rebase_Flag'], list_continue_rebase)       
#        ser_mom_z = transformParallel(ser_mom.groupby('Observation_Date'), by_date_z_score_to_parallel).sort_index() ### Parallelization attempt
        ### Adding results to matrix cube:
        ### (ticker name is added as the first index level; the table is appended, so the file should be removed before a full re-export)
        pd.concat([ser_mom_z], keys = [str_index_name], names = ['Index_Name']).to_hdf(str_path_bb_matrix_hdf, key = str_key_matrix_z, format = 'table',
                                                                                       complevel = 9, append = True, mode = 'a',
                                                                                       min_itemsize = {'Index_Name': int_max_name_length})    
#    ### Results output:
#    return pd.concat([ser_mom_z], keys = [str_index_name], names = ['Index_Name'])

gc.collect()
### Typed flags table reading:
df_flags_typed = pd.read_hdf(str_path_bb_idx_hdf, key = str_key_flags_typed)
### Tickers to transform: everything except 'ANT' primary type and 'Index' processing:
df_flags_other = df_flags_typed.loc[df_flags_typed['Type_Prime'].ne('ANT') & df_flags_typed['Processing'].ne('Index')]
### Business-daily history reading and restricting to the selected tickers:
ser_history_bday = pd.read_hdf(str_path_bb_idx_hdf, key = str_key_bday_history)
ser_history_other = ser_history_bday.reindex(df_flags_other.index, level = 'Index_Name')#.loc[['GRFRIAMM Index', 'ITVHYOY Index', 'NERS20Y Index'], All, All]
### Results of the previous run removing (the transformation appends to the file):
if os.path.isfile(str_path_bb_matrix_mom_hdf):
    os.remove(str_path_bb_matrix_mom_hdf)
### Longest ticker name length (minimal item size of the ticker column in the HDF table):
int_max_name_length = max(ser_history_other.index.levels[0].str.len())
### Ticker-by-ticker transformation (each call appends its result to the HDF file):
ser_history_other.groupby('Index_Name', group_keys = False, sort = False).apply(
    complex_transform,
    idx_date_range = idx_date_range,
    df_flags = df_flags_typed,
    int_max_name_length = int_max_name_length,
    int_min_years = int_min_years_z_score,
    str_path_bb_matrix_hdf = str_path_bb_matrix_mom_hdf,
    bool_perform_sa = True)

ECCPEMUM Index : Reindexation
ECCPEMUM Index : Transformation to stock-like series: Cumulative product
ECCPEMUM Index : Seasonality adjustment
ECCPEMUM Index : Transformation to MoM series: Difference with dividing
ECCPEMUM Index : Z-scoring across the observation dates
EUGNEMUQ Index : Reindexation
EUGNEMUQ Index : TAR Primary Type ignoring
EUITEMUM Index : Reindexation
EUPPEMUM Index : Reindexation
EUPPEMUM Index : Transformation to stock-like series: Cumulative product
EUPPEMUM Index : Seasonality adjustment
SA error :  EUPPEMUM Index  :  1988-01-05  :  X13Error
SA error :  EUPPEMUM Index  :  1988-02-05  :  X13Error
SA error :  EUPPEMUM Index  :  2009-02-03  :  X13Error
SA error :  EUPPEMUM Index  :  2009-07-02  :  X13Error
SA error :  EUPPEMUM Index  :  2009-10-02  :  X13Error
SA error :  EUPPEMUM Index  :  2010-02-02  :  X13Error
SA error :  EUPPEMUM Index  :  2010-03-02  :  X13Error
SA error :  EUPPEMUM Index  :  2010-04-07  :  X13Error
SA error :  EUPPEMUM Index  :  2010-08-03  

In [None]:
### RUN TO RE-EXPORT DATA: Z_SCORED EI DIAGONAL CONSTRUCTING (2 LINES TO MODIFY)

### Economic Indices vector loading:
ser_history_bday = pd.read_hdf(str_path_bb_idx_hdf, key = str_key_bday_history)
### Tickers list preparing:
idx_ticker_list = ser_history_bday.index.levels[0]
### Internal links between matrix files and diagonal keys:
dict_diag_link = {}
dict_diag_link[str_key_diag_daily_mom] = str_path_bb_matrix_mom_hdf
### Looping over transformation ways:
for iter_way in dict_diag_link:
    ### File linking:
    str_path_bb_matrix_hdf = dict_diag_link[iter_way]
    ### Creating container for tickers diagonals:
    gc.collect()
    dict_ei_diag = {}
    ### Looping over tickers:
    for iter_ticker in idx_ticker_list:
        ### Loading matrix for each ticker (iter_ticker inside the where-expression is resolved from the loop variable):
        ser_iter_matrix = pd.read_hdf(str_path_bb_matrix_hdf, key = str_key_matrix_z, where = 'Index_Name == iter_ticker')
        ### Excluding TAR tickers (tickers without saved matrix):
        if (len(ser_iter_matrix) > 0):
            print(iter_way, ':', iter_ticker, 'performing')
            ser_iter_matrix = ser_iter_matrix.droplevel('Index_Name')
            ### Extracting unique observation dates for each ticker later than diagonal start date:
            idx_date_list = ser_iter_matrix.loc[All, date_diag_start : ].index.dropna().get_level_values('Observation_Date').unique()
            ### Creating future diagonal vector:
            ser_iter_diag = pd.Series(np.nan, idx_date_list)    
            ### Determining first valid date and first date to place z-scoring results on diagonal:
            ### Looping over unique dates:
            for iter_date in idx_date_list:
                ### Trying to get the latest data date observation:
                try:
                    ser_iter_diag[iter_date] = ser_iter_matrix.loc[All, iter_date].dropna().iloc[-1] ### LINE TO MODIFY
                except (KeyError, IndexError):
                    ### No valid value for this observation date: the diagonal element stays empty
                    ### (other errors are not expected here and are not hidden any more)
                    pass
            ### Checking for data earlier than diagonal start date
            ### (short-circuit check: tickers without observation dates after the diagonal start date have no first diagonal value to look at):
            if ((len(idx_date_list) > 0)
                and (ser_iter_matrix.index.get_level_values('Observation_Date').unique().min() < date_diag_start)
                and pd.notna(ser_iter_diag.values[0])):
                ### Turning diagonal start date column to diagonal with data dates as index dates:
                ser_iter_start_col = ser_iter_matrix.loc[ : idx_date_list[0], [idx_date_list[0]]]
                ### Ticker values on the date to turn:
                list_iter_values_to_turn = ser_iter_start_col.values
                ### Implementing lag before the announcement for the first valid observation date (3 years of data dates with equal first valid index):
                int_release_lag = (ser_iter_start_col.dropna().index[-1][-1] - ser_iter_start_col.dropna().index[-1][0]).days
                ### Taking announcement dates as an index for first column values:
                list_iter_index_to_turn = ser_iter_start_col.index.get_level_values('Data_Date') + pd.offsets.Day(int_release_lag)
                ### Modified column to turn:
                ser_iter_to_turn = pd.Series(list_iter_values_to_turn, index = list_iter_index_to_turn)
                ### Dropping repeated dates and empty dates:
                ser_iter_to_turn = ser_iter_to_turn.groupby(level = 0).apply(lambda ser_date: ser_date.iloc[-1]).dropna() ### LINE TO MODIFY
                ### Cutting series not to intersect with diagonal index:
                ser_iter_to_turn = ser_iter_to_turn.loc[ : idx_date_list[0] - pd.offsets.Day(1)]
                ### Joining series:
                ser_iter_diag = pd.concat([ser_iter_to_turn, ser_iter_diag], axis = 0).sort_index() 
            ### Reindexation to business daily vector and forward filling:
            ser_iter_diag = ser_iter_diag.ffill().reindex(idx_date_range).ffill()
            ### Saving ticker diagonal to the container:
            dict_ei_diag[iter_ticker] = ser_iter_diag
        else:
            print(iter_way, ':', iter_ticker, 'is absent for this way')
    ### Aggregating ticker diagonals:
    ser_diagonal_z = pd.concat(dict_ei_diag, axis = 0)
    ser_diagonal_z.index.names = ['Index_Name', 'Date']
    ser_diagonal_z.name = 'EI_diagonal'
    ### Saving results to hdf file:
    ser_diagonal_z.to_hdf(str_path_bb_diag_hdf, key = iter_way, mode = 'a')

In [None]:
### RUN TO AGGREGATE: GROUP DATA CONSOLIDATION (CELL TO REPLACE)

### Defining group aggregation function:
def group_aggregate(ser_group_list, str_path_group_matrix_hdf):
    """
    Build the average matrix of one (Type_Prime, Sub_Type, Region) group and append it to the group HDF file.

    For a group with several tickers the ticker matrices are reindexed to the union of observation
    dates and of data dates, filled, cut by the diagonal, averaged across tickers and z-scored for
    every observation date. For a single-ticker group the loaded ticker matrix is stored as is.

    Parameters
    ----------
    ser_group_list : pd.Series
        Ticker names of the group. ser_group_list.index[0] is used as the (Type_Prime, Sub_Type, Region) key
        of the stored data.
    str_path_group_matrix_hdf : str
        Path of the output HDF file. It is also the key of dict_group_link that gives the path of the
        source file with ticker matrices.

    Returns
    -------
    None
        The result is appended to str_path_group_matrix_hdf under the key str_key_group_matrix.

    Notes
    -----
    Reads notebook-level names: int_fill_limit, df_flags_typed, dict_group_link, str_key_matrix_z,
    str_key_group_matrix, int_winsorize_bound, flo_winsorize_tolerance, int_winsorize_steps_limit,
    int_min_years_z_score, int_max_type_prime_len, int_max_sub_type_len, int_max_region_len.
    """
    ### Defining triangle extraction:
    def triangle_filter(ser_date):
        """Keep the values of a single (Index_Name, Data_Date) vector whose remaining index is not earlier than that Data_Date."""
        ### Extracting particular Data Date:
        date_diag = ser_date.index.get_level_values('Data_Date')[0]
        ### Dropping constant levels:
        ser_result = ser_date.droplevel(['Index_Name', 'Data_Date'])
        ### Filtering over-diagonal values:
        ser_result = ser_result[ser_result.index >= date_diag]
        ### Results output:
        return ser_result
    ### Defining category dependent gaps filling:
    def conditional_fill(ser_ticker, int_limit = int_fill_limit):
        """Fill gaps of a single ticker inside every Observation_Date: forward only for 'Leading' tickers, backward then forward otherwise."""
        ### Category loading (first index level of the first row is the ticker name):
        str_category = df_flags_typed.loc[ser_ticker.index[0][0], 'Category']
        if (str_category == 'Leading'):
            ser_filled = ser_ticker.groupby('Observation_Date').ffill(limit = int_limit)
        else:
            ser_filled = ser_ticker.groupby('Observation_Date').bfill(limit = int_limit).groupby('Observation_Date').ffill(limit = int_limit)
        ### Results output:
        return ser_filled
    ### Defining time-vector z-scoring procedure:
    def by_date_z_score(ser_date, int_winsorize_bound, flo_tolerance, int_winsorize_steps_limit, int_min_years_adj):
        """
        Z-score a single observation date vector with iterative winsorization; the input vector is never modified.

        Vectors covering less than int_min_years_adj * 365 days (by the first index level) are replaced by NaN,
        vectors with standard deviation not greater than flo_tolerance are only demeaned. Result is float32.
        """
        ### Check for empty vector (doing nothing):
        if ser_date.count():
            ### Check for minimal quantity of observations to z-score:
            bool_long_enough = (ser_date.last_valid_index()[0] - ser_date.first_valid_index()[0]).days >= (int_min_years_adj * 365)
            if not bool_long_enough:
                ### Killing values that we can't z-score (new vector is created not to mutate the group passed by groupby):
                ser_date = pd.Series(np.nan, index = ser_date.index, name = ser_date.name)
            ### Check for non-constant vector:
            elif (ser_date.std() > flo_tolerance):
                ### Calculating of z scores:
                ser_date = (ser_date - ser_date.mean()) / ser_date.std()
                int_iter = 1
                ### At least one winsorization pass is always performed:
                while True:
                    int_iter += 1
                    ### Value based winsorization:
                    ser_date = ser_date.clip(lower = -int_winsorize_bound, upper = int_winsorize_bound)
                    ### Recalculating of z scores:
                    ser_date = (ser_date - ser_date.mean()) / ser_date.std()
                    ### Checking for boundaries and steps:
                    bool_bounded = (ser_date.loc[ser_date.abs() >= (int_winsorize_bound + flo_tolerance)].count() == 0)
                    if (bool_bounded or (int_iter > int_winsorize_steps_limit)):
                        break
            else:
                ### Constant values demeaning:
                ser_date = ser_date - ser_date.mean()
        ### Memory optimization:
        ser_date = ser_date.astype('float32')
        return ser_date
    ### Group aggregation announce:
    print(str_path_group_matrix_hdf, ':', ser_group_list.index[0], 'group average matrix construction started')
    ### Extracting group ticker names:
    list_group_members = ser_group_list.to_list()
    ### Creating ticker data container:
    list_group_matrix = []
    ### Looping over tickers to collect group data (the where expression refers to the loop variable by its name):
    for iter_ticker in list_group_members:
        list_group_matrix.append(pd.read_hdf(dict_group_link[str_path_group_matrix_hdf], key = str_key_matrix_z, where = 'Index_Name == iter_ticker'))
    ### Group data aggregating:
    ser_group_matrix = pd.concat(list_group_matrix)
    ### Group have more than one member:
    if (len(list_group_members) > 1):
        ### Union of observation dates defining:
        idx_observation_range = ser_group_matrix.index.get_level_values('Observation_Date').unique().sort_values()
#        idx_observation_bm = pd.date_range(start = idx_observation_range[0], end = idx_observation_range[-1], freq = 'BM')
#        idx_observation_range = idx_observation_range.union(idx_observation_bm).unique().sort_values()
        idx_observation_range.name = 'Observation_Date'
        ### Reindexation of observation dates:
        ser_obs_range = ser_group_matrix.groupby('Index_Name')\
                                        .apply(lambda ser_name: ser_name.droplevel('Index_Name').unstack('Data_Date').reindex(idx_observation_range)\
                                                                        .stack('Data_Date', dropna = False).squeeze().swaplevel().sort_index())
        gc.collect()
        ### Filling for each data date:
        ser_obs_range = ser_obs_range.groupby(['Index_Name', 'Data_Date']).ffill()
        ### Union of event dates defining:
        idx_event_range = ser_obs_range.index.get_level_values('Data_Date').unique().sort_values()
        ### Reindexation of event dates:
        ser_event_range = ser_obs_range.groupby('Index_Name')\
                                       .apply(lambda ser_name: ser_name.droplevel('Index_Name').unstack('Observation_Date').reindex(idx_event_range)\
                                                                       .stack('Observation_Date', dropna = False).squeeze().sort_index())
        del ser_obs_range
        gc.collect()
        ### Filling for each observation date depending on ticker category:
        ser_event_range = ser_event_range.groupby(['Index_Name'], group_keys = False).apply(conditional_fill, int_fill_limit)
        ### Cutting by the diagonal:
        ser_triangle = ser_event_range.groupby(['Index_Name', 'Data_Date'], observed = True).apply(triangle_filter).sort_index()
        del ser_event_range
        gc.collect()
        ### Group average taking:
        ser_average = ser_triangle.unstack('Index_Name').mean(axis = 1)
        ser_average.name = 'Average'
        del ser_triangle
        gc.collect()
        ### Z-scoring for each observation date:
        ser_average_z = ser_average.groupby('Observation_Date')\
                                   .transform(by_date_z_score, int_winsorize_bound, flo_winsorize_tolerance, int_winsorize_steps_limit, int_min_years_z_score)
        del ser_average
        gc.collect()
        ### Adding group data to hdf file:
        pd.concat([ser_average_z], keys = [ser_group_list.index[0]], names = ['Type_Prime', 'Sub_Type', 'Region'])\
                                .to_hdf(str_path_group_matrix_hdf, key = str_key_group_matrix, format = 'table', complevel = 9, append = True, mode = 'a',
                                        min_itemsize = {'Type_Prime': int_max_type_prime_len, 'Sub_Type': int_max_sub_type_len, 'Region': int_max_region_len})
#        ### Results output:
#        return ser_triangle
    else:
        ### Single member group: ticker matrix is stored without additional processing
        ser_group_matrix.name = 'Average'
        ### Adding group data to hdf file:
        pd.concat([ser_group_matrix.droplevel('Index_Name')], keys = [ser_group_list.index[0]], names = ['Type_Prime', 'Sub_Type', 'Region'])\
                                .to_hdf(str_path_group_matrix_hdf, key = str_key_group_matrix, format = 'table', complevel = 9, append = True, mode = 'a',
                                        min_itemsize = {'Type_Prime': int_max_type_prime_len, 'Sub_Type': int_max_sub_type_len, 'Region': int_max_region_len})
#        ### Results output:
#        return ser_group_matrix.droplevel('Index_Name')
    ### Success message:
    print(str_path_group_matrix_hdf, ':', ser_group_list.index[0], 'group average matrix successfully added to file')
    
### Flags loading & removing tickers with seasonality adjustment failed:
### (rows of the flags table are dropped by their index labels, so the table is indexed by ticker names)
list_SA_failed = ['JNPIY Index', 'SLPRYOYA Index']
df_flags_typed = pd.read_hdf(str_path_bb_idx_hdf, key = str_key_flags_typed).drop(list_SA_failed, axis = 0)
### Primary types list excluding ANT and TAR
### (NOTE: list.remove raises ValueError if 'ANT' or 'TAR' is absent in the flags table):
list_pop_types = list(df_flags_typed['Type_Prime'].unique())
list_pop_types.remove('ANT')
list_pop_types.remove('TAR')
### Length limits for levels determination (passed as min_itemsize to to_hdf inside group_aggregate):
int_max_type_prime_len = df_flags_typed['Type_Prime'].str.len().max(axis = 0)
int_max_sub_type_len = df_flags_typed['Sub_Type'].str.len().max(axis = 0)
int_max_region_len = df_flags_typed['Region'].str.len().max(axis = 0)
### Flags converting to group register: series indexed by (Type_Prime, Sub_Type, Region),
### its values are the former flags index (group_aggregate uses them as ticker names):
ser_group_register = df_flags_typed[['Type_Prime', 'Sub_Type', 'Region']].reset_index().set_index(['Type_Prime', 'Sub_Type', 'Region']).squeeze()
### Internal links between matrix files and group files (output group file path -> source ticker matrices file path):
dict_group_link = {}
dict_group_link[str_path_group_matrix_mom_hdf] = str_path_bb_matrix_mom_hdf
for str_path_group_matrix_hdf in dict_group_link: # [str_path_group_matrix_mom_hdf]: # 
    gc.collect()
    ### Previous HDF file deleting (group_aggregate appends to the file, so it has to start from scratch):
    if os.path.isfile(str_path_group_matrix_hdf):
        os.remove(str_path_group_matrix_hdf)
    ### Choosing groups to aggregate: ANT groups only for the ANT file, all types except ANT and TAR otherwise
    ### (str_path_group_matrix_ant_hdf is not defined in this cell - presumably set in the parameters cell, TODO confirm):
    if (str_path_group_matrix_hdf == str_path_group_matrix_ant_hdf):
        ser_iter_register = ser_group_register.loc[['ANT'], All, All]
    else: 
        ser_iter_register = ser_group_register.loc[list_pop_types, All, All]        
    ### Region average matrix aggregating (group_aggregate is called for its side effect of writing to the HDF file):
    ser_iter_register.groupby(['Type_Prime', 'Sub_Type', 'Region'], group_keys = True, sort = False).apply(group_aggregate, str_path_group_matrix_hdf)

In [None]:
### RUN TO RE-EXPORT DATA: REGIONS DIAGONAL CONSTRUCTING (1 LINE TO MODIFY)

### Flags loading:
df_flags_typed = pd.read_hdf(str_path_bb_idx_hdf, key = str_key_flags_typed)
### Internal links between matrix files and diagonal keys (diagonal key in the output file -> group matrices file path):
dict_diag_link = {}
dict_diag_link[str_key_diag_group_mom] = str_path_group_matrix_mom_hdf

for iter_way in dict_diag_link:
    ### File linking:
    str_path_group_matrix_hdf = dict_diag_link[iter_way]
    ### Creating container for groups diagonals:
    list_groups_diag = []
    ### Looping over groups (the where expression below refers to the loop variable iter_group by its name):
    for iter_group in df_flags_typed[['Type_Prime', 'Sub_Type', 'Region']].drop_duplicates().sort_values(['Type_Prime', 'Sub_Type', 'Region']).values:
        ser_iter_matrix = pd.read_hdf(str_path_group_matrix_hdf, key = str_key_group_matrix, 
                                      where = '(Type_Prime == iter_group[0]) & (Sub_Type == iter_group[1]) & (Region == iter_group[2])')\
                            .droplevel(['Type_Prime', 'Sub_Type', 'Region'])
        ### Skipping groups that have no data in the file:
        if (len(ser_iter_matrix) > 0):
            print(iter_way, ':', iter_group, 'performing') 
            ### Extracting unique observation dates for each ticker later than diagonal start date:
            idx_date_list = ser_iter_matrix.loc[All, date_diag_start : ].index.dropna().get_level_values('Observation_Date').unique()
            ### Creating future diagonal vector:
            ser_iter_diag = pd.Series(np.nan, idx_date_list)
            ### Looping over unique dates:
            for iter_date in idx_date_list:
                ### Trying to get the latest data date observation:
                try:
                    ser_iter_diag[iter_date] = ser_iter_matrix.loc[All, iter_date].dropna().iloc[-1] ### LINE TO MODIFY
                except (KeyError, IndexError):
                    ### No valid value for this observation date: diagonal point stays NaN (any other error is not hidden)
                    continue
            ### Checking for data earlier than diagonal start date:
            if ((ser_iter_matrix.index.get_level_values('Observation_Date').unique().min() < date_diag_start) & pd.notna(ser_iter_diag.values[0])):
                ### Selecting column to be turned (values of the first diagonal observation date for earlier data dates):
                ser_iter_to_turn = ser_iter_matrix.loc[ : idx_date_list[0] - pd.offsets.Day(1), [idx_date_list[0]]].droplevel('Observation_Date')
                ### Joining series:
                ser_iter_diag = pd.concat([ser_iter_to_turn, ser_iter_diag], axis = 0).sort_index() 
            ### Reindexation to business daily vector and forward filling:
            ser_iter_diag = ser_iter_diag.ffill().reindex(idx_date_range).ffill()
            ### Saving ticker diagonal to the container:
            list_groups_diag.append(pd.concat([ser_iter_diag], keys = [tuple(iter_group)], names = ['Type_Prime', 'Sub_Type', 'Region']))
            ### Naming is needed for the optional plotting below only:
            ser_iter_diag.name = '/'.join(iter_group)
#            ser_iter_diag.plot(figsize = (15, 5))
#            plt.show()
#        else:
#            print(iter_way, ':', iter_group, 'is not for this way')
    ### Container converting and adding to HDF:
    pd.concat(list_groups_diag, axis = 0).to_hdf(str_path_bb_diag_hdf, key = iter_way, mode = 'a')

In [None]:
### RUN TO RE-EXPORT DATA: REGIONAL MATRICES AGGREGATION TO SUB TYPE MATRICES (CELL TO REPLACE)

### Defining triangle extraction:
def triangle_filter(ser_date):
    """
    Cut a single Data_Date vector by the diagonal.

    The 'Data_Date' level is removed from the index and only the rows whose remaining
    index value is not earlier than the first Data_Date of the vector are returned.
    """
    ### Vector without the constant level:
    ser_no_data_date = ser_date.droplevel('Data_Date')
    ### Diagonal date is the Data Date of the first row:
    date_first = ser_date.index.get_level_values('Data_Date')[0]
    ### Mask of values lying on the diagonal or under it:
    arr_keep = ser_no_data_date.index >= date_first
    return ser_no_data_date[arr_keep]
### Defining time-vector z-scoring procedure:    
def by_date_z_score(ser_date, int_winsorize_bound, flo_tolerance, int_winsorize_steps_limit, int_min_years_adj):
    """
    Z-score a single vector with iterative value based winsorization.

    The input vector is never modified: a new vector is always returned.

    Parameters
    ----------
    ser_date : pd.Series
        Vector to z-score. The first level of its index has to hold dates: the history length is measured
        as the difference between the first index levels of the last and the first valid observations.
    int_winsorize_bound : number
        Z-scores are clipped to [-int_winsorize_bound, int_winsorize_bound] on every winsorization pass.
    flo_tolerance : float
        Vector with standard deviation not greater than this value is treated as constant. The same value
        is the slack added to the bound when checking whether the winsorization can be stopped.
    int_winsorize_steps_limit : int
        Winsorization stops when the passes counter (which is 2 after the first pass) exceeds this value.
        At least one pass is always performed.
    int_min_years_adj : number
        Minimal history length in years (365 days each). Shorter vectors are replaced by NaN values.

    Returns
    -------
    pd.Series
        float32 vector with the index and the name of the input: z-scores for non-constant vectors,
        demeaned values for constant vectors, NaN for too short vectors, unchanged values for empty vectors.
    """
    ### Check for empty vector (doing nothing):
    if ser_date.count():
        ### Check for minimal quantity of observations to z-score:
        bool_long_enough = (ser_date.last_valid_index()[0] - ser_date.first_valid_index()[0]).days >= (int_min_years_adj * 365)
        if not bool_long_enough:
            ### Killing values that we can't z-score (new vector is created not to mutate the group passed by groupby):
            ser_date = pd.Series(np.nan, index = ser_date.index, name = ser_date.name)
        ### Check for non-constant vector:
        elif (ser_date.std() > flo_tolerance):
            ### Calculating of z scores:
            ser_date = (ser_date - ser_date.mean()) / ser_date.std()
            int_iter = 1
            ### At least one winsorization pass is always performed:
            while True:
                int_iter += 1
                ### Value based winsorization:
                ser_date = ser_date.clip(lower = -int_winsorize_bound, upper = int_winsorize_bound)
                ### Recalculating of z scores:
                ser_date = (ser_date - ser_date.mean()) / ser_date.std()
                ### Checking for boundaries and steps:
                bool_bounded = (ser_date.loc[ser_date.abs() >= (int_winsorize_bound + flo_tolerance)].count() == 0)
                if (bool_bounded or (int_iter > int_winsorize_steps_limit)):
                    break
        else:
            ### Constant values demeaning:
            ser_date = ser_date - ser_date.mean()
    ### Memory optimization:
    ser_date = ser_date.astype('float32')
    return ser_date
### Defining region averaging:
def sub_type_aggregate(ser_iter_sub, str_path_sub_matrix_hdf):
    """
    Aggregate region (group) matrices of a single sub type to the z-scored sub type matrix and append it to the HDF file.

    For every row of ser_iter_sub the group matrix is loaded, reindexed to the common observation dates
    (idx_obs_range) and event dates (idx_event_range) with forward filling, cut by the diagonal and added
    as a column to a common table. The columns are then combined by columns_average (defined elsewhere
    in the notebook) and the result is z-scored for every observation date.

    Parameters
    ----------
    ser_iter_sub : pd.Series
        Series indexed by (Type_Prime, Sub_Type, Region) tuples; its values are passed to columns_average
        as weights.
    str_path_sub_matrix_hdf : str
        Path of the output HDF file. It is also the key of dict_group_link that gives the path of the
        source file with group matrices.

    Returns
    -------
    None
        The result is appended to str_path_sub_matrix_hdf under the key str_key_sub_matrix.

    Notes
    -----
    Reads notebook-level names: dict_group_link, str_key_group_matrix, str_key_sub_matrix, idx_obs_range,
    idx_event_range, columns_average, int_winsorize_bound, flo_winsorize_tolerance,
    int_winsorize_steps_limit, int_min_years_z_score, int_max_type_prime_len, int_max_sub_type_len.
    """
    gc.collect()
    print(str_path_sub_matrix_hdf, ':', ser_iter_sub.index[0][: 2], 'aggregation procedure started')
    ### Weights extracting (same order as the rows looped below):
    list_iter_weights = ser_iter_sub.values
    ### Looping over regions:
    for iter_num, iter_index in enumerate(ser_iter_sub.index):
        print(str_path_sub_matrix_hdf, ':', iter_index, ser_iter_sub[iter_index])
        ### Loading group matrix (the where expression refers to the loop variable iter_index by its name):
        ser_iter_group = pd.read_hdf(dict_group_link[str_path_sub_matrix_hdf], key = str_key_group_matrix, 
                                where = '(Type_Prime == iter_index[0]) & (Sub_Type == iter_index[1]) & (Region == iter_index[2])')\
                            .droplevel(['Type_Prime', 'Sub_Type', 'Region'])
        print(str_path_sub_matrix_hdf, ':', iter_index, 'matrix loaded')
        ### Observation dates reindexation and forward filling for every event date:
        gc.collect()
        ser_iter_group = ser_iter_group.unstack('Data_Date').reindex(idx_obs_range).ffill(axis = 0).stack('Data_Date', dropna = False).swaplevel().sort_index()
        ser_iter_group.index.names = ['Data_Date', 'Observation_Date']
        print(str_path_sub_matrix_hdf, ':', iter_index, 'observation dates reindexed')        
        ### Event dates reindexation and forward filling for every observation date:
        gc.collect()
        ser_iter_group = ser_iter_group.unstack('Observation_Date').reindex(idx_event_range).ffill(axis = 0).stack('Observation_Date', dropna = False).sort_index()
        ser_iter_group.index.names = ['Data_Date', 'Observation_Date']
        print(str_path_sub_matrix_hdf, ':', iter_index, 'event dates reindexed')        
        ### Triangle filtering (values are downcasted to float16 before filtering):
        gc.collect()
        ser_iter_group = ser_iter_group.astype('float16').groupby('Data_Date').apply(triangle_filter)
        ### Creating dataframe for the future averaging:
        if (iter_num == 0):
            ser_iter_group.name = iter_index[2]
            df_iter_sub = ser_iter_group.to_frame()
        ### Adding column to existing dataframe (values are assigned by position, without index alignment,
        ### so every region vector has to have the same length and order as the first one):
        else:
            df_iter_sub[iter_index[2]] = ser_iter_group.values
        del ser_iter_group
        gc.collect()
        print(str_path_sub_matrix_hdf, ':', iter_index, 'matrix added to table')
    ### Sub type average calculating:
    ser_mean = columns_average(df_iter_sub, list_iter_weights)
    del df_iter_sub
    gc.collect()
    ### iter_index below is the last index tuple of the loop; its first two elements are (Type_Prime, Sub_Type):
    print(str_path_sub_matrix_hdf, ':', iter_index[: 2], 'mean calculated')
    ### Sub type average z-scoring:
    ser_mean_z = ser_mean.groupby('Observation_Date').transform(by_date_z_score, int_winsorize_bound, flo_winsorize_tolerance, int_winsorize_steps_limit, 
                                                              int_min_years_z_score)
    del ser_mean
    gc.collect()    
    print(str_path_sub_matrix_hdf, ':', iter_index[: 2], 'mean z-scored')
    ### Sub type average adding to the HDF file:
    pd.concat([ser_mean_z], keys = [iter_index[: 2]], names = ['Type_Prime', 'Sub_Type'])\
                                    .to_hdf(str_path_sub_matrix_hdf, key = str_key_sub_matrix, format = 'table', complevel = 9, append = True, mode = 'a',
                                            min_itemsize = {'Type_Prime': int_max_type_prime_len, 'Sub_Type': int_max_sub_type_len}) 
    print(str_path_sub_matrix_hdf, ':', iter_index[: 2], 'z-scored matrix saved')    
#    ### Results output:    
#    return ser_mean_z

### Garbage collecting:
gc.collect()
### Flags loading & removing tickers with seasonality adjustment failed:
list_SA_failed = ['JNPIY Index', 'SLPRYOYA Index']
df_flags_typed = pd.read_hdf(str_path_bb_idx_hdf, key = str_key_flags_typed).drop(list_SA_failed, axis = 0)
### Primary types list excluding ANT and TAR
### (NOTE: list.remove raises ValueError if 'ANT' or 'TAR' is absent in the flags table):
list_pop_types = list(df_flags_typed['Type_Prime'].unique())
list_pop_types.remove('ANT')
list_pop_types.remove('TAR')
### Region weights adopting (dict_region_weight is defined elsewhere in the notebook);
### the series is named 'Region' and is joined on the 'Region' index level below:
ser_region_weight = pd.Series(dict_region_weight)
ser_region_weight.name = 'Region'
### Extracting group indices to groupby: unique (Type_Prime, Sub_Type, Region) combinations
### as a series of regions indexed by (Type_Prime, Sub_Type):
ser_sub_type = df_flags_typed[['Type_Prime', 'Sub_Type', 'Region']].drop_duplicates().sort_values(['Type_Prime', 'Sub_Type'])\
                                                    .reset_index().set_index(['Type_Prime', 'Sub_Type']).drop('Index_Name', axis = 1).squeeze()
### TAR type dropping (referred to as GDP in the comments of this notebook):
ser_sub_type = ser_sub_type.drop('TAR', level = 'Type_Prime')
### Adding region weights to group info: regions are moved to the index and the joined weights become the values:
ser_sub_type = ser_sub_type.to_frame().set_index('Region', append = True).join(ser_region_weight, on = 'Region').squeeze().sort_index()
ser_sub_type.name = 'Weight'
### Loading dates indices (used by sub_type_aggregate for reindexation):
idx_event_range = pd.read_hdf(str_path_overall_dates_hdf, key = str_key_event_dates).index
idx_obs_range = pd.read_hdf(str_path_overall_dates_hdf, key = str_key_obs_dates).index
### Length limits for levels determination (passed as min_itemsize to to_hdf inside sub_type_aggregate):
int_max_type_prime_len = ser_sub_type.index.levels[0].str.len().max()
int_max_sub_type_len = ser_sub_type.index.levels[1].str.len().max()
### Internal links between region files and sub type files (output sub type file path -> source group file path);
### NOTE: the name dict_group_link is rebound here and is read by sub_type_aggregate:
dict_group_link = {}
dict_group_link[str_path_sub_matrix_mom_hdf] = str_path_group_matrix_mom_hdf

for str_path_sub_matrix_hdf in dict_group_link:
    ### Previous HDF file deleting (sub_type_aggregate appends to the file, so it has to start from scratch):
    if os.path.isfile(str_path_sub_matrix_hdf):
        os.remove(str_path_sub_matrix_hdf)
    ### Choosing sub types to aggregate: ANT only for the ANT file, all types except ANT and TAR otherwise
    ### (str_path_sub_matrix_ant_hdf is not defined in this cell - presumably set in the parameters cell, TODO confirm):
    if (str_path_sub_matrix_hdf == str_path_sub_matrix_ant_hdf):
        ser_iter_sub_type = ser_sub_type.loc[['ANT'], All, All]
    else:
        ser_iter_sub_type = ser_sub_type.loc[sorted(list_pop_types), All] 
    ### Aggregation performing (sub_type_aggregate is called for its side effect of writing to the HDF file):
    ser_iter_sub_type.groupby(['Type_Prime', 'Sub_Type']).apply(sub_type_aggregate, str_path_sub_matrix_hdf)

In [None]:
### RUN TO RE-EXPORT DATA: Z_SCORED SUB TYPES DIAGONAL CONSTRUCTING (1 LINE TO MODIFY)

### Flags loading:
df_flags_typed = pd.read_hdf(str_path_bb_idx_hdf, key = str_key_flags_typed)
### Chunk size defining (number of rows read from the HDF table per iteration):
int_chunksize = 10 ** 6
### Internal links between matrix files and diagonal keys (diagonal key in the output file -> sub type matrices file path):
dict_diag_link = {}
dict_diag_link[str_key_diag_sub_mom] = str_path_sub_matrix_mom_hdf

for iter_way in dict_diag_link:
    ### File linking:
    str_path_sub_matrix_hdf = dict_diag_link[iter_way]
    ### Creating container for groups diagonals:
    list_groups_diag = []
    ### Looping over sub types:
    for iter_group in df_flags_typed[['Type_Prime', 'Sub_Type']].drop_duplicates().sort_values(['Type_Prime', 'Sub_Type']).values:
        ### Check for not GDP:
        if (iter_group[0] != 'TAR'):
            ### Iteration container preparing:
            gc.collect()
            list_iter_container = []
            ### Sub type matrix loading by chunks; only observation dates starting from date_diag_start are read
            ### (the where expression refers to iter_group and date_diag_start by their names):
            print(iter_way, ':', iter_group, 'loading') 
            for iter_chunk in pd.read_hdf(str_path_sub_matrix_hdf, key = str_key_sub_matrix, chunksize = int_chunksize, 
                                          where = '(Type_Prime == iter_group[0]) & (Sub_Type == iter_group[1]) & (Observation_Date >= date_diag_start)'):
                list_iter_container.append(iter_chunk)
                del iter_chunk
                gc.collect()    
            ### Skipping sub types for which no chunks were read from the file:
            if (len(list_iter_container) > 0):                 
                ### Sub type matrix constructing:
                ser_iter_matrix = pd.concat(list_iter_container, axis = 0).droplevel(['Type_Prime', 'Sub_Type']).sort_index()
                del list_iter_container
                gc.collect()           
                print(iter_way, ':', iter_group, 'main diagonal part constructing') 
                ### Extracting unique observation dates for each ticker later than diagonal start date:
                idx_date_list = ser_iter_matrix.index.get_level_values('Observation_Date').unique()
                ### Creating future diagonal vector: last valid value of every observation date (NaN if there are no valid values):
                ser_iter_diag = ser_iter_matrix.groupby('Observation_Date')\
                                               .apply(lambda ser_obs_date: ser_obs_date.dropna().iloc[-1] if (ser_obs_date.count() > 0) else np.NaN) ### LINE TO CHANGE
                ### Checking for data earlier than diagonal start date: first index value of valid data selected for
                ### the first observation date is compared with date_diag_start (presumably it is the earliest Data_Date - TODO confirm):
                if (ser_iter_matrix.dropna().loc[All, idx_date_list[0]].index[0] < date_diag_start):
                    ### Selecting column to be turned (values of the first observation date for earlier data dates):
                    print(iter_way, ':', iter_group, 'auxiliary diagonal part constructing') 
                    ser_iter_to_turn = ser_iter_matrix.loc[ : idx_date_list[0] - pd.offsets.Day(1), [idx_date_list[0]]].droplevel('Observation_Date')
                    ### Joining series:
                    ser_iter_diag = pd.concat([ser_iter_to_turn, ser_iter_diag], axis = 0).sort_index() 
                ### Reindexation to business daily vector and forward filling:
                ser_iter_diag = ser_iter_diag.ffill().reindex(idx_date_range).ffill()
                ### Memory optimization:
                ser_iter_diag = ser_iter_diag.astype('float32')
                ### Saving ticker diagonal to the container:
                list_groups_diag.append(pd.concat([ser_iter_diag], keys = [tuple(iter_group)], names = ['Type_Prime', 'Sub_Type']))
                print(iter_way, ':', iter_group, 'diagonal added to container')
    ### Container converting and adding to HDF (third index level, which holds the dates of the diagonal, is named 'Data_Date'):
    ser_sub_type_diag = pd.concat(list_groups_diag, axis = 0)
    ser_sub_type_diag.index.set_names(names = 'Data_Date', level = 2, inplace = True)
    ser_sub_type_diag.to_hdf(str_path_bb_diag_hdf, key = iter_way, mode = 'a')

In [None]:
### RUN TO RE-EXPORT DATA: SUB TYPES MATRICES AGGREGATION TO GROUP MATRICES BY PCA FPC (COVARIANCE MATRIX NEWEY-WEST ADJUSTMENT + LIMITED Z-SCORING) (2 LINES TO MODIFY)

### Defining dataframe columns PCA performing:
def single_date_pca(df_iter_date, int_min_years_pca, list_list_weights, bool_do_nw_adj = True, list_dict_weights = False, list_dict_corrs = False):
    ### Dropping constant level:
    date_iter_obs = df_iter_date.index[0][-1]
    df_iter_date = df_iter_date.droplevel('Observation_Date')
    ### Weights vector initialising:
    if (isinstance(list_dict_weights, bool) == True):
        bool_collect_weights = False
    else:
        bool_collect_weights = True
        ser_iter_weights = pd.Series(np.NaN, index = df_iter_date.columns)    
    ### Correlation matrix collection flag initialising:
    if (isinstance(list_dict_corrs, bool) == True):
        bool_collect_corrs = False
    else:
        bool_collect_corrs = True          
    ### Check for not empty observation date vector:
    if (len(df_iter_date.dropna(how = 'all').index) > 0):
        ### Dropping columns that does not have enough data length:
        for iter_col in df_iter_date.columns:
            if (df_iter_date[iter_col].count() == 0):
                df_iter_date.drop(iter_col, axis = 1, inplace = True)
            elif (df_iter_date[iter_col].dropna().last_valid_index() - df_iter_date[iter_col].dropna().first_valid_index()).days < (int_min_years_pca * 365):
                df_iter_date.drop(iter_col, axis = 1, inplace = True)
        ### No columns (all sub types dropped) check:
        if (len(df_iter_date.columns) == 0):
            ser_iter_res = pd.Series(np.NaN, index = df_iter_date.index)
            if (date_iter_obs == (date_iter_obs + pd.offsets.BMonthEnd(0))):
                print(date_iter_obs.strftime('%Y-%m-%d'), ': All sub types are too short')
        ### Single column (single sub type) check:            
        elif (len(df_iter_date.columns) == 1):
            ser_iter_res = df_iter_date.squeeze()
            ### Weights vector filling:
            if bool_collect_weights:
                ser_iter_weights.loc[df_iter_date.columns[0]] = 1.0
        ### Two columns (sinple mean) check:            
        elif (len(df_iter_date.columns) == 2):
            ser_iter_res = df_iter_date.mean(axis = 1)
            ### Weights vector filling:    
            if bool_collect_weights:
                ser_iter_weights.loc[df_iter_date.columns] = [0.5, 0.5]
            ### Weights vector filling:    
            if bool_collect_corrs:
                df_iter_corr = df_iter_date.resample('B').ffill()[- int_corr_tail * int_bus_year :].corr()
                df_iter_corr[df_iter_corr < 0.0] = 0.0  
                list_dict_corrs[0][date_iter_obs] = df_iter_corr.stack(dropna = False)   
        ### More than one vectors to aggregate:
        else:
            ### Check if we need to calculate new weights (if observation date is BusinessMonthEnd):
            if ((date_iter_obs == (date_iter_obs + pd.offsets.BMonthEnd(0))) | (isinstance(list_list_weights[0], bool) == True)):
                if bool_do_nw_adj:
                    ### Samples correlation matrices collection:
                    dict_sample_corr = {}
                    ### Perform covariance adjustment for subsamples:
                    df_iter_daily = df_iter_date.resample('B').ffill()[- int_corr_tail * int_bus_year : ]
                    for iter_sample in range(int_cov_samples):
                        df_iter_sample = df_iter_daily[iter_sample :: int_cov_samples]
                        ### Looping over lags:
                        for iter_lag in range(int_n_w_lag + 1):
                            df_iter_cov_lagged = pd.DataFrame(np.NaN, index = df_iter_sample.columns, columns = df_iter_sample.columns)
                            ### Looping over sub tupe pairs to calculate lagged covariance
                            for iter_pair in combinations_with_replacement(df_iter_sample.columns, r = 2):
                                ### Lagged covariance calculation:
                                if (iter_lag == 0):
                                    flo_cov_lagged = df_iter_sample[iter_pair[0]].cov(df_iter_sample[iter_pair[1]])
                                else:
                                    flo_cov_lagged = df_iter_sample[iter_pair[0]].shift(iter_lag).cov(df_iter_sample[iter_pair[1]]) + \
                                                     df_iter_sample[iter_pair[1]].shift(iter_lag).cov(df_iter_sample[iter_pair[0]])
                                ### Weight adding:
                                flo_cov_lagged = (1 - iter_lag / (int_n_w_lag + 1)) * flo_cov_lagged
                                ### Adding results to the dataframe:
                                df_iter_cov_lagged.loc[iter_pair[0], iter_pair[1]] = flo_cov_lagged
                                df_iter_cov_lagged.loc[iter_pair[1], iter_pair[0]] = flo_cov_lagged
                                ### Covariance NaN check:
                                if np.isnan(flo_cov_lagged):
                                    print(iter_sample, '/', iter_lag, '/', date_iter_obs.strftime('%Y-%m-%d'), '/', iter_pair[0], '/', iter_pair[1], 
                                          ': NaN covariance')
                            ### Covariance matrix summation:
                            if (iter_lag == 0):
                                df_iter_n_w_cov = df_iter_cov_lagged
                            else:
                                df_iter_n_w_cov = df_iter_n_w_cov + df_iter_cov_lagged
                        ### Standard deviation extracting:
                        ser_iter_n_w_std = pd.Series(np.NaN, index = df_iter_n_w_cov.columns)
                        for iter_col in df_iter_n_w_cov.columns:
                            ser_iter_n_w_std.loc[iter_col] = (df_iter_n_w_cov.loc[iter_col, iter_col]) ** (1/2)
                        ### Correlation matrix calculation:
                        df_iter_n_w_std = ser_iter_n_w_std.to_frame().dot(ser_iter_n_w_std.to_frame().T)
                        ### Samples correlcation matrices summation:
                        dict_sample_corr[iter_sample] = df_iter_n_w_cov / df_iter_n_w_std
                    ### Samples correlation matrices averaging:
#                    df_iter_corr = pd.concat(dict_sample_corr).mean(axis = 0, level = 1)
                    df_iter_corr = pd.concat(dict_sample_corr).groupby(level = 1).mean() ### LINE TO MODIFY
                else:    
#                    ### Simple correlation matrix:
#                    df_iter_corr = df_iter_date.corr()
                    df_iter_corr = df_iter_date.resample('B').ffill()[- int_corr_tail * int_bus_year :].corr()
                ### Negative coefficients correction:
                df_iter_corr[df_iter_corr < 0.0] = 0.0
                ### Adding correlation matrix to collection:
                if bool_collect_corrs:            
                    list_dict_corrs[0][date_iter_obs] = df_iter_corr.stack(dropna = False)                 
                ### PCA weights calculating:
                list_evals, list_evecs = np.linalg.eigh(df_iter_corr)
                df_iter_evecs = pd.DataFrame(data = list_evecs).round(4)
                ### First Principal Component based weighted average calculating:
                list_iter_weights = df_iter_evecs.iloc[:, -1].values
                ### Save calulated weights:
                list_list_weights[0] = list_iter_weights
                print(date_iter_obs.strftime('%Y-%m-%d'), '(', list_iter_weights, ') : New weights saved')
            ### New PCA participant:
            elif (len(df_iter_date.columns) > len(list_list_weights[0])):
                if bool_do_nw_adj:                
                    ### Samples correlation matrices collection:
                    dict_sample_corr = {}
                    ### Perform covariance adjustment for subsamples:
                    df_iter_daily = df_iter_date.resample('B').ffill()[- int_corr_tail * int_bus_year : ]
                    for iter_sample in range(int_cov_samples):
                        df_iter_sample = df_iter_daily[iter_sample :: int_cov_samples]
                        ### Looping over lags:
                        for iter_lag in range(int_n_w_lag + 1):
                            df_iter_cov_lagged = pd.DataFrame(np.NaN, index = df_iter_sample.columns, columns = df_iter_sample.columns)
                            ### Looping over sub tupe pairs to calculate lagged covariance
                            for iter_pair in combinations_with_replacement(df_iter_sample.columns, r = 2):
                                ### Lagged covariance calculation:
                                if (iter_lag == 0):
                                    flo_cov_lagged = df_iter_sample[iter_pair[0]].cov(df_iter_sample[iter_pair[1]])
                                else:
                                    flo_cov_lagged = df_iter_sample[iter_pair[0]].shift(iter_lag).cov(df_iter_sample[iter_pair[1]]) + \
                                                     df_iter_sample[iter_pair[1]].shift(iter_lag).cov(df_iter_sample[iter_pair[0]])
                                ### Weight adding:
                                flo_cov_lagged = (1 - iter_lag / (int_n_w_lag + 1)) * flo_cov_lagged
                                ### Adding results to the dataframe:
                                df_iter_cov_lagged.loc[iter_pair[0], iter_pair[1]] = flo_cov_lagged
                                df_iter_cov_lagged.loc[iter_pair[1], iter_pair[0]] = flo_cov_lagged
                                ### Covariance NaN check:
                                if np.isnan(flo_cov_lagged):
                                    print(date_iter_obs.strftime('%Y-%m-%d'), '/', iter_pair[0], '/', iter_pair[1], ': NaN covariance')
                            ### Covariance matrix summation:
                            if (iter_lag == 0):
                                df_iter_n_w_cov = df_iter_cov_lagged
                            else:
                                df_iter_n_w_cov = df_iter_n_w_cov + df_iter_cov_lagged
                        ### Standard deviation extracting:
                        ser_iter_n_w_std = pd.Series(np.NaN, index = df_iter_n_w_cov.columns)
                        for iter_col in df_iter_n_w_cov.columns:
                            ser_iter_n_w_std.loc[iter_col] = (df_iter_n_w_cov.loc[iter_col, iter_col]) ** (1/2)
                        ### Correlation matrix calculation:
                        df_iter_n_w_std = ser_iter_n_w_std.to_frame().dot(ser_iter_n_w_std.to_frame().T)
                        ### Samples correlcation matrices summation:
                        dict_sample_corr[iter_sample] = df_iter_n_w_cov / df_iter_n_w_std
                    ### Samples correlation matrices averaging:
#                    df_iter_corr = pd.concat(dict_sample_corr).mean(axis = 0, level = 1)
                    df_iter_corr = pd.concat(dict_sample_corr).groupby(level = 1).mean() ### LINE TO MODIFY             
                else:
#                    ### Simple correlation matrix:
#                    df_iter_corr = df_iter_date.corr()
                    ### Das experiment:
                    df_iter_corr = df_iter_date.resample('B').ffill()[- int_corr_tail * int_bus_year : ].corr()
                ### Negative coefficients correction:
                df_iter_corr[df_iter_corr < 0.0] = 0.0
                ### Adding correlation matrix to collection:
                if bool_collect_corrs:            
                    list_dict_corrs[0][date_iter_obs] = df_iter_corr.stack(dropna = False)  
                ### PCA weights calculating:
                list_evals, list_evecs = np.linalg.eigh(df_iter_corr)
                df_iter_evecs = pd.DataFrame(data = list_evecs).round(4)
                ### First Principal Component based weighted average calculating:
                list_iter_weights = df_iter_evecs.iloc[:, -1].values
                ### Save calulated weights:
                list_list_weights[0] = list_iter_weights
                print(date_iter_obs.strftime('%Y-%m-%d'), '(', list_iter_weights, ') : New weights saved')              
            else:
                ### Get weights calculated for the last BusinessMonthEnd:
                list_iter_weights = list_list_weights[0]            
            ### Weighted average calculating:
            ser_iter_res = columns_average(df_iter_date, list_iter_weights)
            ### Different signs of weights check:
            if (min(list_iter_weights) < 0 < max(list_iter_weights)):
                print(date_iter_obs.strftime('%Y-%m-%d'), ': Some weights have different signs')
            ### Weights vector filling:
            ### Sign flipping (if needed):            
            if (sum(list_iter_weights) < 0):
                list_iter_weights = list(map(lambda iter_weight: -1 * iter_weight, list_iter_weights))
            ### Weights normalizing:
            list_iter_weights = list(map(lambda iter_weight: iter_weight / sum(list_iter_weights), list_iter_weights))
            ### Weights collecting:
            if bool_collect_weights:            
                ser_iter_weights.loc[df_iter_date.columns] = list_iter_weights                
    ### If observation vector is empty:
    else:
        ser_iter_res = df_iter_date.iloc[All, 0]
    ### Weights collecting:    
    if bool_collect_weights:
        ser_iter_weights.name = 'FPC'       
        ser_iter_weights = ser_iter_weights.astype('float32')
        list_dict_weights[0][date_iter_obs] = ser_iter_weights
    ### Results output:
    ser_iter_res.name = 'PCA'
    return ser_iter_res.astype('float32')
### Defining time-vector z-scoring procedure:    
def by_date_z_score(ser_date, int_winsorize_bound, flo_tolerance, int_winsorize_steps_limit, int_min_years_adj, int_max_years_adj = 100):
    """Z-score one observation-date vector with iterative value-based winsorization.

    Parameters:
        ser_date: numeric series whose index entries are tuples with a date-like first element
            (the code reads `index[-1][0]`, `first_valid_index()[0]` and `last_valid_index()[0]`).
        int_winsorize_bound: symmetric clipping bound applied to the z-scores.
        flo_tolerance: minimal standard deviation for a vector to be treated as non-constant;
            also the slack added to the bound when looking for remaining outliers.
        int_winsorize_steps_limit: the clip / re-standardize loop stops once its step counter exceeds this value
            (at least one step is always performed).
        int_min_years_adj: minimal span, in 365-day years, between the first and the last valid value.
        int_max_years_adj: values older than this number of years before the last index date are set to NaN.

    Returns:
        float32 series on the same index:
          - z-scored and winsorized values when the span is long enough and the vector is not constant;
          - demeaned values when the span is long enough but the standard deviation does not exceed the tolerance;
          - all NaN when the span is too short;
          - the vector itself (cast to float32) when it is empty or has no valid values after the cut.

    The input series is never modified: all work is done on a copy.
    """
    ### Empty vector: nothing to cut or to score (index[-1] would raise):
    if ser_date.empty:
        return ser_date.astype('float32')
    ### Working on a copy, so the group handed over by groupby.transform is not mutated:
    ser_date = ser_date.copy()
    ### Cutting old values:
    ser_date.loc[ : ser_date.index[-1][0] - pd.DateOffset(months = int_max_years_adj * 12)] = np.nan
    ### Check for empty vector (doing nothing):
    if ser_date.count():
        ### Check for minimal quantity of observations to z-score (performed once for both branches below):
        bool_enough_history = (ser_date.last_valid_index()[0] - ser_date.first_valid_index()[0]).days >= (int_min_years_adj * 365)
        if not bool_enough_history:
            ### Killing values that we can't z-score:
            ser_date.loc[:] = np.nan
        ### Check for non-constant vector:
        elif (ser_date.std() > flo_tolerance):
            ### Calculating of z scores:
            ser_date = (ser_date - ser_date.mean()) / ser_date.std()
            bool_to_winsor = True
            int_iter = 1
            while bool_to_winsor:
                int_iter += 1
                ### Value based winsorization:
                ser_date = ser_date.clip(lower = -int_winsorize_bound, upper = int_winsorize_bound)
                ### Recalculating of z scores:
                ser_date = (ser_date - ser_date.mean()) / ser_date.std()
                ### Checking for boundaries and steps:
                bool_no_outliers = not (ser_date.abs() >= (int_winsorize_bound + flo_tolerance)).any()
                bool_to_winsor = not (bool_no_outliers or (int_iter > int_winsorize_steps_limit))
        else:
            ### Constant values demeaning:
            ser_date = ser_date - ser_date.mean()
    ### Memory optimization:
    return ser_date.astype('float32')

### Typed flags table loading from the indices HDF file:
df_flags_typed = pd.read_hdf(str_path_bb_idx_hdf, key = str_key_flags_typed)
### Unique (Type_Prime, Sub_Type) pairs turned into a Sub_Type vector indexed by Type_Prime:
ser_type_prime = (
    df_flags_typed[['Type_Prime', 'Sub_Type']]
    .drop_duplicates()
    .sort_values(['Type_Prime', 'Sub_Type'])
    .reset_index()
    .set_index(['Type_Prime'])
    .drop('Index_Name', axis = 1)
    .squeeze()
)
### Rows per chunk for HDF reading:
int_chunksize = 1_000_000
### Longest global index name length:
int_max_global_name = max(len(str_iter_name) for str_iter_name in dict_global_index_name.values())
### Previous HDF collections deleting (weights first, then correlation matrices):
for str_path_iter_hdf in (str_path_bb_weights_hdf, str_path_bb_corrs_hdf):
    if os.path.isfile(str_path_iter_hdf):
        os.remove(str_path_iter_hdf)
### Looping over global indices combinations:
### For every combination: sub type matrices are loaded and joined column-wise, collapsed by single_date_pca for each observation date,
### z-scored by by_date_z_score for each observation date and saved to the combination HDF file;
### collected weights and correlation matrices are saved to their own HDF files.
for iter_combo in dict_global_index_name:
    gc.collect()
    ### Resulting matrix defining:
    str_path_global_matrix_z_hdf = dict_global_index_hdf[iter_combo]
    ### Global index name defining:
    str_global_name = dict_global_index_name[iter_combo]
    ### Diagnostics key defining (looked up once and up front, so a missing key fails before the long PCA run):
    str_global_diag_key = dict_global_index_diag_key[iter_combo]
    print(str_global_name, ': Calculation started')
    ### Combination members list: a string key is wrapped into a list, any other key is expanded to a list:
    if (isinstance(iter_combo, str)):
        iter_list = [iter_combo]
    else:
        iter_list = list(iter_combo)
#    print(iter_list, '/', str_global_name, '/', str_path_global_matrix_z_hdf)
    ### Previous HDF file deleting:
    if os.path.isfile(str_path_global_matrix_z_hdf):
        os.remove(str_path_global_matrix_z_hdf)
    ### External actual weights list to change:
    list_ext_weights = [False]
    ### External full history weights list to change:
    list_dict_weights = [{}]
    ### External full history correlation matrices list to change:
    list_dict_corrs = [{}]
    ### Sub type list for global index initiating:
    dict_sub_types = {}
    ### Looping over global index list elements to prepare info for data extraction loop:
    for iter_member in iter_list:
        ### Source files for element defining:
        ### NOTE(review): a member that is neither 'ANT' nor '*_mom' keeps whatever value str_path_sub_matrix_hdf already holds
        ### (previous member, previous combination or an earlier cell) - confirm that this fallback is intended.
        if (iter_member == 'ANT'):
            str_path_sub_matrix_hdf = str_path_sub_matrix_ant_hdf
        elif iter_member.endswith('_mom'):
            str_path_sub_matrix_hdf = str_path_sub_matrix_mom_hdf
        ### Element sub types defining (prime type is the member name part before the first underscore):
        iter_prime_type = iter_member.split('_')[0]
        list_iter_sub_types = ser_type_prime[[iter_prime_type]].values
        for iter_sub_type in list_iter_sub_types:
            dict_sub_types[(iter_prime_type, iter_sub_type)] = str_path_sub_matrix_hdf
    ### First element flag for dataframe initialization by first series:
    bool_first_element = True
    ### Data extraction loop:
    for iter_type in dict_sub_types:
        ### Iteration container preparing:
        list_iter_container = []
        str_path_sub_matrix_hdf = dict_sub_types[iter_type]
        ### Sub type matrix loading by looping over chunks (the where string refers to iter_type by name, so the loop variable must keep this name):
        for iter_chunk in pd.read_hdf(str_path_sub_matrix_hdf, key = str_key_sub_matrix, chunksize = int_chunksize,
                                      where = '(Type_Prime == iter_type[0]) & (Sub_Type == iter_type[1])'):
#        for iter_chunk in pd.read_hdf(str_path_sub_matrix_hdf, key = str_key_sub_matrix, chunksize = int_chunksize,
#                                      where = '(Type_Prime == iter_type[0]) & (Sub_Type == iter_type[1]) & (Observation_Date > idx_obs_range[-174])'):
#        for iter_chunk in pd.read_hdf(str_path_sub_matrix_hdf, key = str_key_sub_matrix, chunksize = int_chunksize,
#                                      where = '(Type_Prime == iter_type[0]) & (Sub_Type == iter_type[1]) & (Observation_Date < idx_obs_range[1900])'):
            list_iter_container.append(iter_chunk)
            del iter_chunk
            gc.collect()
        print(str_global_name, ':', iter_type, ': Chunks loaded')
        ser_iter_matrix = pd.concat(list_iter_container, axis = 0).droplevel(['Type_Prime', 'Sub_Type']).sort_index()
        del list_iter_container
        gc.collect()
        ser_iter_matrix.name = '/'.join([iter_type[0], iter_type[1]])
        print(str_global_name, ':', iter_type, ': Chunks aggregated')
        if bool_first_element:
            df_iter_matrix = ser_iter_matrix.to_frame()
            bool_first_element = False
        else:
            ### NOTE(review): values are attached by position, not by index label - this relies on every sub type matrix
            ### having exactly the same sorted index as the first one; confirm against the sub matrix construction.
            df_iter_matrix[ser_iter_matrix.name] = ser_iter_matrix.values
        del ser_iter_matrix
        gc.collect()
        print(str_global_name, ':', iter_type, ': Vector added to dataframe')
#    ### PCA first component extracting without correlation matrix Newey-West adjustment procedure:
#    ser_iter_pca = df_iter_matrix.groupby('Observation_Date').apply(single_date_pca, int_min_years_pca, list_ext_weights, False, list_dict_weights, list_dict_corrs)\
#                                                             .swaplevel().sort_index()
    ### PCA first component extracting with correlation matrix Newey-West adjustment procedure:
    ser_iter_pca = df_iter_matrix.groupby('Observation_Date').apply(single_date_pca, int_min_years_pca, list_ext_weights, True, list_dict_weights, list_dict_corrs)\
                                                             .swaplevel().sort_index()
    del df_iter_matrix
    gc.collect()
    print(str_global_name, ': PCA weighting performed')
    ### Saving weights collection to HDF:
    pd.DataFrame(list_dict_weights[0]).transpose().to_hdf(str_path_bb_weights_hdf, key = str_global_diag_key, mode = 'a')
    del list_dict_weights[0]
    gc.collect()
    print(str_global_name, ': Weights collection added to the file')
    ### Saving correlation matrices collection to HDF (same key, separate file):
    pd.DataFrame(list_dict_corrs[0]).transpose().stack().to_hdf(str_path_bb_corrs_hdf, key = str_global_diag_key, mode = 'a')
    del list_dict_corrs[0]
    gc.collect()
    print(str_global_name, ': Pairwise correlations collection added to the file')
    ### Observation date vectors z-scoring:
    ser_iter_pca_z = ser_iter_pca.groupby('Observation_Date')\
                                 .transform(by_date_z_score, int_winsorize_bound, flo_winsorize_tolerance, int_winsorize_steps_limit,
                                            int_min_years_z_score, int_max_years_z_score)
    ### Stored series name is the same for every global index (each index is written to its own file):
    ser_iter_pca_z.name = 'PCA_FPC_Z_Scored'
    del ser_iter_pca
    gc.collect()
    print(str_global_name, ': Index matrix z-scored')
    ### Adding aggregated data to the file:
#    pd.concat([ser_iter_pca_z], keys = [str_global_name], names = ['Global_Index'])\
#                                    .to_hdf(str_path_global_matrix_z_hdf, key = str_key_global_matrix, format = 'table', complevel = 9, append = True, mode = 'a',
#                                            min_itemsize = {'Global_Index': int_max_global_name})
    ser_iter_pca_z.to_hdf(str_path_global_matrix_z_hdf, key = str_key_global_matrix, format = 'table', complevel = 9, append = True, mode = 'a')
    del ser_iter_pca_z
    gc.collect()
    print(str_global_name, ': Index matrix added to the file')

In [11]:
### RUN TO RE-EXPORT DATA: Z_SCORED PCA FPC DIAGONAL CONSTRUCTING (1 LINE TO MODIFY)

### Chunk size defining:
int_chunksize = 10 ** 6

### Looping over global indices: the z-scored matrix of each index is collapsed to one value per observation date
### (the last valid value of that date's vector), extended to a business daily vector and saved to the diagonals HDF file.
for iter_combo in dict_global_index_name:
    gc.collect()
    ### Resulting matrix defining:
    str_path_global_matrix_z_hdf = dict_global_index_hdf[iter_combo]
    ### Global index name defining:
    str_global_name = dict_global_index_name[iter_combo]
    ### Global index diagonal key defining:
    str_global_diag_key = dict_global_index_diag_key[iter_combo]
    ### Iteration container preparing:
    list_iter_container = []
    ### Global index matrix loading by chunks (the where string refers to date_diag_start by name):
    print(str_global_name, ': Diagonal extraction started')
    for iter_chunk in pd.read_hdf(str_path_global_matrix_z_hdf, key = str_key_global_matrix, chunksize = int_chunksize,
                                  where = 'Observation_Date >= date_diag_start'):
        list_iter_container.append(iter_chunk)
        del iter_chunk
        gc.collect()
    ### Global index matrix constructing:
    ser_iter_matrix = pd.concat(list_iter_container, axis = 0).sort_index()
    del list_iter_container
    ### Files written with the additional Global_Index level are supported too:
    if ('Global_Index' in ser_iter_matrix.index.names):
        ser_iter_matrix = ser_iter_matrix.droplevel('Global_Index')
    gc.collect()
    print(str_global_name, ': Global index matrix loaded')
    print(str_global_name, ': Main diagonal part constructing')
    ### Extracting unique observation dates not earlier than diagonal start date:
    idx_date_list = ser_iter_matrix.index.get_level_values('Observation_Date').unique()
    ### Creating future diagonal vector: last valid value of each observation date vector (NaN when there is none):
    ser_iter_diag = ser_iter_matrix.groupby('Observation_Date')\
                                   .apply(lambda ser_obs_date: ser_obs_date.dropna().iloc[-1] if (ser_obs_date.count() > 0) else np.nan)
    ### Checking for data earlier than diagonal start date inside the first observation date vector:
    if (ser_iter_matrix.dropna().loc[All, idx_date_list[0]].index[0] < date_diag_start):
        ### Selecting column to be turned: values of the first observation date vector up to the day before that date:
        print(str_global_name, ': Auxiliary diagonal part constructing')
        ser_iter_to_turn = ser_iter_matrix.loc[ : idx_date_list[0] - pd.offsets.Day(1), [idx_date_list[0]]].droplevel('Observation_Date')
        ### Joining series:
        ser_iter_diag = pd.concat([ser_iter_to_turn, ser_iter_diag], axis = 0).sort_index()
    ### Reindexation to business daily vector and forward filling:
    ser_iter_diag = ser_iter_diag.ffill().reindex(idx_date_range).ffill()
    ser_iter_diag = ser_iter_diag.astype('float32')
    ser_iter_diag.name = str_global_name
    ser_iter_diag.index.names = ['Date']
    ### Saving resulting series to HDF:
    ser_iter_diag.to_hdf(str_path_bb_diag_hdf, key = str_global_diag_key, mode = 'a')
    print(str_global_name, ': Diagonal saved')

Inflation Index (YoY) : Diagonal extraction started
Inflation Index (YoY) : Global index matrix loaded
Inflation Index (YoY) : Main diagonal part constructing
Inflation Index (YoY) : Auxiliary diagonal part constructing
Inflation Index (YoY) : Diagonal saved
Growth Index (YoY) : Diagonal extraction started
Growth Index (YoY) : Global index matrix loaded
Growth Index (YoY) : Main diagonal part constructing
Growth Index (YoY) : Auxiliary diagonal part constructing
Growth Index (YoY) : Diagonal saved


In [12]:
### RUN FOR TESTING: GLOBAL INDICES AND SUB TYPE DIAGONALS SAVING (NO CHANGES)

### Global index diagonals collection (global index name -> diagonal read from the diagonals HDF file):
dict_global_diag = {}
for iter_combo, str_global_name in dict_global_index_name.items():
    gc.collect()
    ### Combination members list (a string key is wrapped, any other key is expanded):
    iter_list = [iter_combo] if isinstance(iter_combo, str) else list(iter_combo)
    ### Key of the diagonal inside the HDF file:
    str_global_diag_key = dict_global_index_diag_key[iter_combo]
    ### Diagonal reading:
    dict_global_diag[str_global_name] = pd.read_hdf(str_path_bb_diag_hdf, key = str_global_diag_key)
### Collection export to Excel, one column per global index:
df_global_diag = pd.DataFrame(dict_global_diag)
df_global_diag.to_excel('Data_Files/Test_Files/2024/Global_Indices.xlsx', merge_cells = False)

In [13]:
### RUN FOR TESTING: GLOBAL INDICES AND SUB TYPE DIAGONALS SAVING (NO CHANGES)

### Sub type diagonals reading: every diagonal gets an outer index level made of the last three characters of its HDF key:
list_sub_diag = []
for iter_str_key_diag_sub in [str_key_diag_sub_mom]:
    ser_iter_sub_diag = pd.read_hdf(str_path_bb_diag_hdf, key = iter_str_key_diag_sub)
    str_iter_suffix = iter_str_key_diag_sub[-3 : ]
    list_sub_diag.append(pd.concat([ser_iter_sub_diag], keys = [str_iter_suffix]))
### Stacking all diagonals into a single vector and naming the added level:
ser_sub_type_diag = pd.concat(list_sub_diag, axis = 0)
ser_sub_type_diag.index.set_names(names = 'Frequency', level = 0, inplace = True)
### Moving sub type descriptors to columns and flattening them to 'Type_Prime/Sub_Type/Frequency' labels:
df_sub_type_diag = ser_sub_type_diag.unstack(['Type_Prime', 'Sub_Type', 'Frequency'])
list_flat_columns = []
for iter_column in df_sub_type_diag.columns:
    list_flat_columns.append('/'.join(iter_column))
df_sub_type_diag.columns = list_flat_columns
### Export to Excel:
df_sub_type_diag.to_excel('Data_Files/Test_Files/2024/Sub_Type_Diagonals.xlsx', merge_cells = False)

In [14]:
### RUN FOR TESTING: WEIGHTS SAVING (NO CHANGES)

### Weights collection (global index name -> weights table resampled with the 'BM' rule):
dict_global_weights = {}
for iter_combo, str_global_name in dict_global_index_name.items():
    gc.collect()
    ### Combination members list (a string key is wrapped, any other key is expanded):
    iter_list = [iter_combo] if isinstance(iter_combo, str) else list(iter_combo)
    ### Key of the weights table inside the HDF file:
    str_global_diag_key = dict_global_index_diag_key[iter_combo]
    ### Weights table reading and resampling:
    df_iter_weights = pd.read_hdf(str_path_bb_weights_hdf, key = str_global_diag_key)
    dict_global_weights[str_global_name] = df_iter_weights.resample('BM').asfreq()
### Export to Excel, one sheet per global index (sheet names are shortened by abbreviating known words):
with pd.ExcelWriter('Data_Files/Test_Files/2024/Global_Weights.xlsx') as xls_writer:
    for iter_name, df_iter_sheet in dict_global_weights.items():
        str_sheet_name = iter_name
        for str_iter_long, str_iter_short in (('Anticipated', 'Ant'), ('Employment', 'Emp'), ('Inflation', 'Inf')):
            str_sheet_name = str_sheet_name.replace(str_iter_long, str_iter_short)
        df_iter_sheet.to_excel(xls_writer, sheet_name = str_sheet_name, merge_cells = False)

In [15]:
### RUN FOR TESTING: CORRELATION MATRICES SAVING (NO CHANGES)

### Correlation matrices collection (global index name -> stacked matrices resampled with the 'BM' rule):
dict_global_corrs = {}
for iter_combo, str_global_name in dict_global_index_name.items():
    gc.collect()
    ### Combination members list (a string key is wrapped, any other key is expanded):
    iter_list = [iter_combo] if isinstance(iter_combo, str) else list(iter_combo)
    ### Key of the correlations vector inside the HDF file:
    str_global_diag_key = dict_global_index_diag_key[iter_combo]
    ### Correlations reading: index level 1 goes to columns for resampling and is stacked back afterwards (NaN rows kept):
    ser_iter_corrs = pd.read_hdf(str_path_bb_corrs_hdf, key = str_global_diag_key)
    df_iter_resampled = ser_iter_corrs.unstack(1).resample('BM').asfreq()
    dict_global_corrs[str_global_name] = df_iter_resampled.stack(dropna = False).sort_index()
### Correlation matrices export to Excel, one sheet per global index (sheet names are shortened by abbreviating known words):
with pd.ExcelWriter('Data_Files/Test_Files/2024/Global_Corr_Matrices.xlsx') as xls_writer:
    for iter_name, df_iter_sheet in dict_global_corrs.items():
        str_sheet_name = iter_name
        for str_iter_long, str_iter_short in (('Anticipated', 'Ant'), ('Employment', 'Emp'), ('Inflation', 'Inf')):
            str_sheet_name = str_sheet_name.replace(str_iter_long, str_iter_short)
        df_iter_sheet.to_excel(xls_writer, sheet_name = str_sheet_name, merge_cells = False)

In [18]:
### RUN TO RE-EXPORT DATA: LEVEL & MOMENTUM PERCENTILE CONSTRUCTING (NO CHANGES)

### Defining percentiles generator:
def take_ptiles(ser_date, int_ptile_months, int_ave_months, list_weight, int_lag):
    """Calculate level and momentum percentiles for one observation-date vector.

    Parameters:
        ser_date: series with a two-level index whose second level is named 'Observation_Date';
            after that level is dropped the remaining index must support date slicing and resampling.
        int_ptile_months: length (months back from the last index date) of the window used for ranking.
        int_ave_months: length (months back from the last index date) of the window used for the weighted mean of changes.
        list_weight: weights list; its tail is aligned with the business daily short window,
            so it has to be at least as long as that window.
        int_lag: shift (in business days) used to build the long series of changes; also scales the weighted mean.

    Returns:
        One-row DataFrame indexed by the observation date with two unnamed columns:
        level percentile (rank of the last value inside the long window) and
        momentum percentile (rank of the scaled weighted mean change among the lagged changes of the long window).
        Both are NaN when the long window has no valid values.

    Relies on the module-level helper weighted_average(values, weights).
    """
    ### Observation date is taken from the second element of the first index entry:
    date_obs = ser_date.index[0][1]
    ser_date = ser_date.droplevel('Observation_Date')
    ### Cutting old values:
    ser_long = ser_date[ser_date.index[-1] - pd.DateOffset(months = int_ptile_months) : ]
    ser_short = ser_date[ser_date.index[-1] - pd.DateOffset(months = int_ave_months) : ]
    ### Check for empty vector (doing nothing):
    if (ser_long.count() > 0):
        ### Level percentile calculation:
        flo_level_ptile = ser_long.rank(method = 'average', na_option = 'keep', ascending = True, pct = True).iloc[-1]
        ### Momentum percentile calculation start: change series (zero changes are treated as missing):
        ser_long = ser_long.resample('B').ffill()
        ser_long_change = (ser_long - ser_long.shift(int_lag)).replace(0.0, np.nan)
        ### Short series of changes calculation:
        ser_short = ser_short.resample('B').ffill()
        ser_short_change = (ser_short - ser_short.shift()).replace(0.0, np.nan)
        ### Weighted mean of changes calculation:
        ser_weight = pd.Series(list_weight[-len(ser_short_change.index) : ], ser_short_change.index)
        flo_ave_change = weighted_average(ser_short_change, ser_weight)
        ### Scaling by the lag and by the sum of squared normalized weights of the valid changes:
        ser_weight_dropped = ser_weight[ser_short_change.dropna().index]
        ser_weight_dropped = ser_weight_dropped / ser_weight_dropped.sum()
        flo_ave_change = flo_ave_change * math.sqrt(int_lag / (ser_weight_dropped ** 2).sum())
        ### Weighted mean rank defining: the last change is replaced by position
        ### (an integer key on a date index is not a label, so .iloc is required):
        ser_long_change.iloc[-1] = flo_ave_change
        flo_mom_ptile = ser_long_change.rank(method = 'average', na_option = 'keep', ascending = True, pct = True).iloc[-1]
        ### Results output:
        return pd.DataFrame([[flo_level_ptile, flo_mom_ptile]], index = [date_obs])
    else:
        ### Results output:
        return pd.DataFrame([[np.nan, np.nan]], index = [date_obs])

### Lag (business days) for the momentum change series; alternatives noted by the author: 5, 1
int_lag = 22
### Weights array: exp_weight_single evaluated for 0 .. (int_ave_months * 22 - 1), then reversed:
list_weight = [exp_weight_single(int_halflife_months * 22, iter_num) for iter_num in range(int_ave_months * 22)][::-1]
### Rows per chunk for HDF reading:
int_chunksize = 10 ** 6
### Starting from a clean percentiles file:
if os.path.isfile(str_path_bb_percentiles_hdf):
    os.remove(str_path_bb_percentiles_hdf)
### Percentiles calculation for each global index:
for iter_combo, str_global_name in dict_global_index_name.items():
    gc.collect()
    ### Source matrix file and destination key of the current global index:
    str_path_global_matrix_z_hdf = dict_global_index_hdf[iter_combo]
    str_global_diag_key = dict_global_index_diag_key[iter_combo]
    ### Global index matrix loading by chunks (the where string refers to date_diag_start by name):
    list_iter_container = []
    print(str_global_name, ': Percentiles calculation started')
    for iter_chunk in pd.read_hdf(str_path_global_matrix_z_hdf, key = str_key_global_matrix, chunksize = int_chunksize,
                                  where = 'Observation_Date >= date_diag_start'):
        list_iter_container.append(iter_chunk)
        del iter_chunk
        gc.collect()
    ### Chunks joining into a sorted vector:
    ser_iter_matrix = pd.concat(list_iter_container, axis = 0).sort_index()
    del list_iter_container
    ### Optional Global_Index level is not needed further:
    if ('Global_Index' in ser_iter_matrix.index.names):
        ser_iter_matrix = ser_iter_matrix.droplevel('Global_Index')
    gc.collect()
    print(str_global_name, ': Global index matrix loaded')
    ### One row of percentiles per observation date:
    df_iter_ptile = ser_iter_matrix.groupby('Observation_Date', group_keys = False)\
                                   .apply(take_ptiles, int_ptile_months, int_ave_months, list_weight, int_lag)
    df_iter_ptile.columns = ['Level_Percentile', 'Momentum_Percentile']
    print(str_global_name, ': Percentiles calculated')
    ### Business daily reindexation, forward filling and memory optimization:
    df_iter_ptile = df_iter_ptile.reindex(idx_date_range).ffill().astype('float32')
    df_iter_ptile.name = str_global_name
    df_iter_ptile.index.names = ['Date']
    ### Result saving under the global index key:
    df_iter_ptile.to_hdf(str_path_bb_percentiles_hdf, key = str_global_diag_key, mode = 'a')
    print(str_global_name, ': Percentiles saved')

Inflation Index (YoY) : Percentiles calculation started
Inflation Index (YoY) : Global index matrix loaded
Inflation Index (YoY) : Percentiles calculated
Inflation Index (YoY) : Percentiles saved
Growth Index (YoY) : Percentiles calculation started
Growth Index (YoY) : Global index matrix loaded
Growth Index (YoY) : Percentiles calculated
Growth Index (YoY) : Percentiles saved


In [19]:
### RUN TO RE-EXPORT DATA: LEVEL & MOMENTUM PERCENTILE SAVING (NO CHANGES)

### Percentiles collection (global index name -> percentiles table read from the percentiles HDF file):
dict_ptiles = {}
for iter_combo, str_global_name in dict_global_index_name.items():
    gc.collect()
    ### Combination members list (a string key is wrapped, any other key is expanded):
    iter_list = [iter_combo] if isinstance(iter_combo, str) else list(iter_combo)
    ### Key of the percentiles table inside the HDF file:
    str_global_diag_key = dict_global_index_diag_key[iter_combo]
    ### Percentiles table reading:
    dict_ptiles[str_global_name] = pd.read_hdf(str_path_bb_percentiles_hdf, key = str_global_diag_key)
### Tables joining side by side (global index name becomes the outer column level) and export to Excel:
df_ptiles_joined = pd.concat(dict_ptiles, axis = 1)
df_ptiles_joined.to_excel('Data_Files/Test_Files/2024/Global_Percentiles.xlsx', merge_cells = False)

In [20]:
### RUN TO RE-EXPORT DATA: LEVEL & MOMENTUM PERCENTILE SIGN FACTORS GENERATING: RESEARCH VERSION (NO CHANGES)

### Constants defining (switching thresholds with a dead zone between them):
### a positive sign flips to negative when the percentile falls below tumbler_to_minus,
### a negative sign flips back to positive when the percentile rises above tumbler_to_plus.
tumbler_to_minus = 0.40
tumbler_to_plus = 0.60
### Percentiles initialization:
dict_ptiles = {}

### Looping over global indices:
for iter_combo in dict_global_index_name:
    gc.collect()
    ### Global index name defining:
    str_global_name = dict_global_index_name[iter_combo]
    ### Global index elements extracting:
    if (isinstance(iter_combo, str)):
        iter_list = [iter_combo]
    else:
        iter_list = list(iter_combo)
    ### Global index diagonal key defining:
    str_global_diag_key = dict_global_index_diag_key[iter_combo]
    ### Percentiles table reading:
    df_iter_combo = pd.read_hdf(str_path_bb_percentiles_hdf, key = str_global_diag_key)
    ### Looping over percentile types inside each global index:
    for str_ptile_type in df_iter_combo.columns:
        print(str_global_name, '/', str_ptile_type, ': Sign factors calculation started')
        ### Percentile extraction (signs are defined for valid percentile dates only):
        ser_iter_ptile = df_iter_combo[str_ptile_type]
        ser_iter_valid = ser_iter_ptile.dropna()
        ### Single pass over the timeseries carrying the previous sign forward:
        ### the previous valid observation is used instead of a label lookup of the previous business day,
        ### so gaps in the dateline do not break the calculation and an all-NaN column gives an empty vector.
        list_iter_signs = []
        flo_iter_sign = 1.0
        for int_iter_num, flo_iter_ptile in enumerate(ser_iter_valid.values):
            ### Initial sign is always positive, flipping starts from the second observation:
            if (int_iter_num > 0):
                if (flo_iter_sign == 1.0):
                    if (flo_iter_ptile < tumbler_to_minus):
                        flo_iter_sign = -1.0
                else:
                    if (flo_iter_ptile > tumbler_to_plus):
                        flo_iter_sign = 1.0
            list_iter_signs.append(flo_iter_sign)
        ser_iter_signs = pd.Series(list_iter_signs, index = ser_iter_valid.index, name = 'Sign', dtype = 'float64')
        print(str_global_name, '/', str_ptile_type, ': Sign factors calculation finished')
        ### Signs vector collecting:
        dict_ptiles[str_global_name + ' / ' + str_ptile_type] = ser_iter_signs
### Factors collection saving:
#pd.concat(dict_ptiles, axis = 1).to_hdf('Data_Files/Source_Files/Percentile_Signs_Research.hdf', key = 'percentile_signs', mode = 'a')
pd.concat(dict_ptiles, axis = 1).to_excel('Data_Files/Test_Files/2024/Percentile_Signs_Research.xlsx', merge_cells = False)

Inflation Index (YoY) / Level_Percentile : Sign factors calculation started
Inflation Index (YoY) / Level_Percentile : Sign factors calculation finished
Inflation Index (YoY) / Momentum_Percentile : Sign factors calculation started
Inflation Index (YoY) / Momentum_Percentile : Sign factors calculation finished
Growth Index (YoY) / Level_Percentile : Sign factors calculation started
Growth Index (YoY) / Level_Percentile : Sign factors calculation finished
Growth Index (YoY) / Momentum_Percentile : Sign factors calculation started
Growth Index (YoY) / Momentum_Percentile : Sign factors calculation finished
