In [1]:
### RUN EVERY TIME: ECONOMIC INDICES RELEASES HISTORY EXTRACTING

In [13]:
### RUN EVERY TIME: INITIALIZATION

import pandas as pd
import numpy as np
from datetime import date, datetime
from math import floor, ceil
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from statsmodels.tsa.x13 import x13_arima_analysis
from itertools import combinations_with_replacement
from sklearn.decomposition import PCA
import os
import gc
from joblib import Parallel, delayed

%load_ext line_profiler

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [3]:
### RUN ONLY WHEN ARIMA X13 SA LAUNCHING 

### Warnings hiding:
### NOTE(review): the 'ignore' filter is installed without a category or module, so it hides every warning raised later in this kernel session
import warnings
warnings.filterwarnings('ignore')
### Seasonal adjustment module paths set up:
### NOTE(review): both variables are set to the same user-specific absolute Windows path, so this cell has to be edited on any other machine
### (presumably these variables are how the X13 wrapper locates the x13as binary - TODO confirm against statsmodels documentation)
%env X13PATH = C:\Users\ighar\AppData\Roaming\jupyter\x13as
%env X12PATH = C:\Users\ighar\AppData\Roaming\jupyter\x13as
#%env

env: X13PATH=C:\Users\ighar\AppData\Roaming\jupyter\x13as
env: X12PATH=C:\Users\ighar\AppData\Roaming\jupyter\x13as


In [4]:
## VERSION CONTROL

### Interpreter version getter importing:
from platform import python_version
### Printing pandas and python versions of the running kernel:
print('pandas version: ', pd.__version__)
print('python version: ', python_version())

pandas version:  0.25.3
python version:  3.7.4


In [5]:
### RUN EVERY TIME: GENERAL DATA PREPARATION

### Constants:
### Full slice used as "take everything" selector inside .loc[] calls:
All = slice(None)
### Strings to be treated as missing values (presumably passed as na_values when reading the excel sources - the reading code is not in this part of the notebook):
list_na_excel_values = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NULL', 'NaN', 'n/a', 'nan', 'null',
                        '#N/A Requesting Data...', '#N/A Invalid Security', '#N/A Field Not Applicable']
### Raw data path and sheets:
str_path_bb_idx_source = 'Data_Files/Source_Files/Bloomberg_Eco_Indices.xlsx'
str_us_sheet = 'US Eco Const'
str_all_sheet = 'All Eco Const'
### Flags data path and sheets:
str_path_bb_idx_flags = 'Data_Files/Source_Files/Bloomberg_Eco_Flags_Extended.xlsx'
str_flag_sheet = 'Bloomberg Description'
### Source data constants:
int_idx_cols = 12
### HDF file with converted source data:
str_path_bb_idx_hdf = 'Data_Files/Source_Files/Bloomberg_Eco_Indices.h5'
str_key_flags = 'flags_exported' ### Acadian flags list
str_key_exported = 'all_idx_exported' ### Raw export with only replacing zero dates and after 2021-01-01 dates with np.NaN
str_key_raw_filled = 'all_idx_raw_filled' ### Raw export with initial dates, dates gaps, absent date columns filled
str_key_raw_history = 'raw_history' ### Export with all the corrections and fillings (restructured to [Index_Name -> Data_Date -> Observation_Date] | Index_Value series)
str_key_bday_history = 'bday_history' ### Raw history vector with observation dates moved to nearest future business dates
str_key_num_history = 'num_history' ### Bday history vector with observation dates changed to their date numbers (for future matrix cube saving as hdf file)
str_key_from_date = 'idx_from_date' ### Series to get date numbers from dates
str_key_to_date = 'idx_to_date' ### Series to get dates from date numbers
str_key_types_info = 'types_info' ### Dataframe with 'Type_Prime' / 'Sub_Type' / 'Region' groups descriptions
str_key_flags_typed = 'flags_typed' ### Dataframe with economic indices descriptions (NOTE(review): the original note ended mid-sentence with "taking into account" - TODO complete)
### HDF file with matrices:
str_path_bb_matrix_hdf = 'Data_Files/Source_Files/Matrix_Eco_Indices.h5'
str_key_matrix_z = 'matrix_cube_z_scored'
### HDF file with diagonals:
str_path_bb_diag_hdf = 'Data_Files/Source_Files/Diag_Eco_Indices.h5'
str_key_diag_daily_raw = 'matrix_diagonal_raw'
str_key_diag_daily_z = 'matrix_diagonal_z'
str_key_diag_group_z = 'groups_diagonal_z'
str_key_diag_sub_z = 'sub_types_diagonal_z'
str_key_diag_agg_z = 'aggregated_diagonal_z'
str_key_diag_mean_z = 'mean_diagonal_z'
### HDF file with group averages:
str_path_group_matrix_hdf = 'Data_Files/Source_Files/Matrix_Groups.h5'
str_key_group_matrix = 'matrix_cube_groups'
### HDF file with overall event dates as series index:
str_path_overall_dates_hdf = 'Data_Files/Source_Files/Overall_Dates.h5'
str_key_event_dates = 'overall_event_dates'
str_key_obs_dates = 'overall_obs_dates'
str_key_triangle_dates = 'overall_triangle_dates'
### HDF file with sub type averages:
str_path_sub_matrix_hdf = 'Data_Files/Source_Files/Matrix_Sub.h5'
str_key_sub_matrix = 'matrix_cube_subs'
### HDF file with global indices PCA FPC:
str_path_global_matrix_hdf = 'Data_Files/Source_Files/Matrix_Global.h5'
str_key_global_matrix = 'matrix_cube_globals'
### HDF file with global indices simple averages:
str_path_mean_matrix_hdf = 'Data_Files/Source_Files/Matrix_Mean.h5'
str_key_mean_matrix = 'matrix_cube_means'
### Observation axis range:
datetime_start = datetime(1984, 12, 31) # Start date for efficacy measures
date_start = datetime_start.date()
datetime_end = datetime(2020, 8, 31) # End date for efficacy measures
date_end = datetime_end.date()
### Business days ('B' frequency) index covering the whole start - end range:
idx_date_range = pd.date_range(date_start, date_end, freq = 'B')
datetime_basis = datetime(1993, 12, 31) # Basis date (NOTE(review): the original comment repeated "End date for efficacy measures"; the actual role is not visible in this part of the notebook - TODO confirm)
date_basis = datetime_basis.date()
### Options for gaps filling:
int_revision_shift = 1
int_final_shift = 2
int_first_mean_length = 12
### Lag for each release frequency (floor of a half of the 90 / 30 / 7 period length):
dict_final_only_lag = {'Quarterly': 90 // 2, 'Monthly': 30 // 2, 'Other': 7 // 2}
### Cumulative product steps for monthly data frequency (keyed by the processing flag value):
dict_cumprod_step = {'MoM%': 1, 'QoQ%': 3, 'YoY%': 12}
### Shifts of stock-like series for YoY transformation (keyed by the frequency):
dict_yoy_shift = {'Monthly': 12, 'Quarterly': 4, 'Other': 52}
### Shifts of stock-like series for MoM transformation (keyed by the frequency):
dict_mom_shift = {'Monthly': 1, 'Other': 4}
### Options for group tickers rebasing:
### Term in years for min ticker data date when the ticker does not need rebasing with the basis group ticker:
int_not_to_rebase_term = 7
### Minimal difference in years between basis ticker and other group ticker min dates when the group ticker needs rebasing:
int_not_to_rebase_diff = 2
### Options for z-scoring:
int_winsorize_bound = 4
flo_winsorize_tolerance = 0.0001
int_winsorize_steps_limit = 5
### Options for diagonals:
int_min_years_z_score = 3
date_diag_start = datetime(1994, 1, 1)
### Limit for data filling:
int_fill_limit = 20
### Weights of regions:
dict_region_weight = {'US': 0.50, 'Europe': 0.25, 'Japan': 0.15, 'UK': 0.10}
### Order of groups aggregation (the list and the dictionary are kept as independent objects):
list_agg_groups = [['EMP'], ['INF'], ['ANT', 'CON', 'OUT']]
dict_agg_groups = {'Employment': ['EMP'], 'Inflation': ['INF'], 'Economy': ['ANT', 'CON', 'OUT']}
### Maximum lag for a-la Newey-West adjustment:
int_n_w_lag = 4
### Number of covariance subsamples:
int_cov_samples = 22
### Minimal years for a column to be used in PCA:
int_min_years_pca = 7

In [6]:
### DEFINING OBSERVATION DATE VECTOR EXTRACTION

def get_obs_date_vector(str_ticker, str_date, bool_exact_date = False, bool_drop_levels = False):
    """
    Loading the z-scored vector of a single ticker for one observation date from the matrix HDF file.

    Parameters:
        str_ticker : value to compare 'Index_Name' with;
        str_date : value to compare 'Observation_Date' with;
        bool_exact_date : True to load exactly str_date, False to load the latest observation date not later than str_date;
        bool_drop_levels : True to drop 'Index_Name' and 'Observation_Date' levels from the result.

    Returns:
        Selected part of the stored series.
    """
    ### Selection condition choosing (str_ticker and str_date are referenced by name inside the condition, so they have to stay local variables of this function):
    if bool_exact_date:
        str_condition = 'Index_Name == str_ticker & Observation_Date == str_date'
    else:
        str_condition = 'Index_Name == str_ticker & Observation_Date <= str_date'
    ### Data loading:
    ser_loaded = pd.read_hdf(str_path_bb_matrix_hdf, key = str_key_matrix_z, where = str_condition)
    ### Keeping only the max loaded date for the nearest date mode:
    if not bool_exact_date:
        date_nearest = ser_loaded.index.levels[-1].max()
        ser_loaded = ser_loaded.loc[All, All, [date_nearest]]
    ### Results output (with constant index levels dropped if needed):
    if not bool_drop_levels:
        return ser_loaded
    return ser_loaded.droplevel(['Index_Name', 'Observation_Date'])

In [7]:
### DEFINING WEIGHTED AVERAGE FOR DATAFRAME COLUMNS (PART OF THE PRODUCT CODE)

def columns_average(df_series, list_weights = False):
    """
    Row-wise weighted average of dataframe columns ignoring NaN values.

    For every row the weight of a column holding NaN is zeroed, so the average is taken over the available values only.
    A row without any available value gives NaN.

    Parameters:
        df_series : dataframe with columns to average;
        list_weights : sequence of column weights in columns order (extra trailing weights are ignored);
                       any boolean value means equal weights.

    Returns:
        Series with df_series index. A single column dataframe is returned as that column without weighting
        (always as a series, even for a single row dataframe).

    Raises:
        IndexError : if list_weights is shorter than the number of columns.
    """
    int_columns = len(df_series.columns)
    ### Single column check:
    if (int_columns > 1):
        ### Equal weights creating:
        if isinstance(list_weights, bool):
            arr_weights = np.ones(int_columns)
        else:
            arr_weights = np.asarray(list(list_weights)[ : int_columns], dtype = float)
            if (len(arr_weights) < int_columns):
                raise IndexError('list_weights is shorter than the number of dataframe columns')
        ### Dataframe of weights: column weight where the value is present, zero where it is NaN:
        df_weights = df_series.notna().astype(float).multiply(arr_weights, axis = 'columns')
        ### Weighted sum of available values divided by sum of their weights:
        ser_mean = df_series.multiply(df_weights).sum(axis = 1).div(df_weights.sum(axis = 1))
    else:
        ### Squeezing the columns axis only to keep a series for a single row dataframe:
        ser_mean = df_series.squeeze(axis = 1)
    ### Results output:
    return ser_mean

In [8]:
### RUN TO TEST: INDICES DATA CONVERTING TO STOCK-LIKE TO FURTHER SA TEST

### Defining Economic Index series transformation:
def complex_transform(ser_name, idx_date_range, df_flags, int_max_name_length, int_min_years, bool_perform_sa = False):
    """
    Converting the releases history of a single Economic Index to a stock-like series.

    Steps: observation dates are restricted to those present in idx_date_range, values are forward filled inside each Data_Date,
    observations earlier than their Data_Date are dropped and the result is transformed according to the 'Processing' flag of the index.

    Parameters:
        ser_name : series of one index with 'Index_Name' (first), 'Data_Date' and 'Observation_Date' index levels;
        idx_date_range : index of observation dates allowed in the result;
        df_flags : dataframe indexed by index name with 'Type_Prime', 'Processing' and 'Frequency' columns;
        int_max_name_length, int_min_years, bool_perform_sa : not used by this version of the function.

    Returns:
        Transformed series with 'Index_Name' level added in front of its index.
        Series with 'TAR' primary type are returned without flags-based transforming.

    Raises:
        KeyError : if a cumulative product is needed for monthly data and the 'Processing' flag is absent in dict_cumprod_step.
    """
    ### Defining triangle extraction:
    def triangle_filter(ser_date):
        ### Extracting particular Data Date:
        date_diag = ser_date.index.get_level_values('Data_Date')[0]
        ### Dropping constant level:
        ser_result = ser_date.droplevel('Data_Date')
        ### Filtering over-diagonal values (observations made before the data date):
        ser_result = ser_result[ser_result.index >= date_diag]
        ### Results output:
        return ser_result
    ### Percent ticker values (already converted to multipliers) transforming to stock-like series for a single observation date:
    def yoy_to_level(ser_date, int_step):
        ### Dropping constant level:
        ser_result = ser_date.droplevel('Observation_Date')
        ### Basis initiating:
        flo_basement = 1.0
        ### Factor initiating:
        flo_next_brick  = 1.0
        ### Looping over starting positions inside the first step (each one starts its own chain of values taken every int_step positions):
        for iter_period in range(min(int_step, len(ser_result.index))):
            ### Basement building up:
            flo_basement = flo_basement * flo_next_brick
            ### Next basement brick producing (value of ser_result at this position is read before the chain below overwrites it):
            flo_next_brick = ((flo_next_brick ** (iter_period)) * (ser_result.iloc[iter_period] ** (1 / int_step))) ** (1 / (iter_period + 1))
            ### Jumping cumulative product performing:
            idx_iter_data = ser_result.index[iter_period :: int_step]
            ser_result.loc[idx_iter_data] = ser_result.loc[idx_iter_data].cumprod() * flo_basement
        ### Results output:
        return ser_result
    ### EI name extracting:
    str_index_name = ser_name.index.get_level_values(0)[0]
    ### Observation dates reindexation:
    print(str_index_name, ': Reindexation')
    idx_observation_range = ser_name.index.get_level_values('Observation_Date').unique().intersection(idx_date_range).sort_values()
    ser_full = ser_name.droplevel('Index_Name').unstack('Data_Date').reindex(idx_observation_range).stack('Data_Date', dropna = False).squeeze()
    ser_full = ser_full.swaplevel()
    ser_full.index.rename('Observation_Date', level = -1, inplace = True)
    ### Forward filling for each data date:
    ser_full = ser_full.groupby('Data_Date').ffill()
    ### Diagonalization:
    ser_triangle = ser_full.groupby('Data_Date').apply(triangle_filter).sort_index()
    ### Flags extracting:
    ser_flags = df_flags.loc[str_index_name, All].squeeze()
    ### 'TAR' type checking:
    if (ser_flags['Type_Prime'] == 'TAR'):
        print(str_index_name, ': TAR Primary Type ignoring')
        ### Triangle is passed to the output as is, so the result is defined for this branch too:
        ser_stock = ser_triangle
    ### Flags-based transforming:
    else:
        ### Indices of NA values collecting:
        idx_isna = ser_triangle.loc[ser_triangle.isna()].index
        ### Transforming to stock-like series:
        if (ser_flags['Processing'] in ['Index', 'Level', 'Level%']):
            ser_stock = ser_triangle
        elif (ser_flags['Processing'] == 'Flow'):
            print(str_index_name, ': Transformation to stock-like series: Cumulative sum')
            ### Filling empty values:
            ser_triangle = ser_triangle.fillna(0)
            ### Cumulative sum for each observation date calculating:
            ser_stock = ser_triangle.groupby('Observation_Date').cumsum()
            ### Dropping NA values:
            ser_stock.loc[idx_isna] = np.NaN
        else:
            print(str_index_name, ': Transformation to stock-like series: Cumulative product')
            ### Filling empty values:
            ser_triangle = ser_triangle.fillna(0)
            ### Percents to multipliers converting:
            ser_stock = 1 + ser_triangle / 100
            ### Calculating with needed periodicity:
            ### NOTE(review): for any other frequency the multipliers are returned without a cumulative product - confirm this is intended
            if (ser_flags['Frequency'] == 'Monthly'):
                int_step = dict_cumprod_step[ser_flags['Processing']]
                ### Year-by-year cumprod with rebasing:
                ser_stock = ser_stock.groupby('Observation_Date').apply(yoy_to_level, int_step).swaplevel().sort_index()
            ### Dropping NA values:
            ser_stock.loc[idx_isna] = np.NaN
    ### Results output:
    return pd.concat([ser_stock], keys = [str_index_name], names = ['Index_Name'])

### Flags loading:
df_flags_typed = pd.read_hdf(str_path_bb_idx_hdf, key = str_key_flags_typed)
### Economic Indices vector loading:
ser_history_bday = pd.read_hdf(str_path_bb_idx_hdf, key = str_key_bday_history)
### Conforming 'Index_Name' level of the history to the flags table index:
ser_history_bday = ser_history_bday.reindex(df_flags_typed.index, level = 'Index_Name')
### Testing:
### NOTE(review): import placed in the middle of the notebook; _open_files is a private attribute of tables.file
### (presumably used here to release HDF file handles left open by previous reads - TODO confirm)
import tables
tables.file._open_files.close_all()
#list_test_ticker = ['JCOMHCF Index']
list_test_ticker = ['NAPMPMI Index', 'CHPMINDX Index', 'DFEDGBA Index', 'EMPRGBCI Index', 'MAPMINDX Index', 'NAPMNMI Index', 'OUTFGAF Index', 'RCHSINDX Index']
### Test tickers selecting with all data dates and all observation dates:
ser_history_bday_test = ser_history_bday.loc[list_test_ticker, All, All]
### Ticker by ticker conversion (0 is passed as int_max_name_length):
ser_test_stock = ser_history_bday_test.groupby('Index_Name', group_keys = False)\
                                     .apply(complex_transform, idx_date_range, df_flags_typed, 0, int_min_years_z_score, bool_perform_sa = False)

NAPMPMI Index : Reindexation
CHPMINDX Index : Reindexation
DFEDGBA Index : Reindexation
EMPRGBCI Index : Reindexation
MAPMINDX Index : Reindexation
NAPMNMI Index : Reindexation
OUTFGAF Index : Reindexation
RCHSINDX Index : Reindexation


In [102]:
### RUN TO TEST: PERFORMING X13-SA WITH STOCK-LIKE CONVERTED DATA

### Defining Economic Index series transformation:
def complex_transform(ser_stock, idx_date_range, df_flags, int_max_name_length, int_min_years, bool_perform_sa = False):
    """
    Performing X13 ARIMA seasonality adjustment of a single stock-like Economic Index series.

    Parameters:
        ser_stock : series of one index with 'Index_Name' (first), 'Data_Date' and 'Observation_Date' index levels;
        df_flags : dataframe indexed by index name with 'Type_Prime' and 'SA_Status' columns;
        int_min_years : minimal distance in years (365 days each) between first and last valid values to adjust a vector;
        bool_perform_sa : adjustment is performed only if this flag is set and 'SA_Status' (stripped of spaces) is not 'SA';
        idx_date_range, int_max_name_length : not used by this version of the function.

    Returns:
        Series with 'Index_Name' level added in front of its index.
        Series with 'TAR' primary type and series that do not need adjustment are returned with their values unchanged.
    """
    ### X13 ARIMA Seasonality adjustment model:
    def perform_x13_sa(ser_date):
        ### Dropping constant level:
        ser_result = ser_date.droplevel('Observation_Date')
        ### Check for non empty vector:
        if (ser_result.count() > 0):
            ### Check for minimal quantity of observations to perform seasonality adjustment:
            if (ser_result.last_valid_index() - ser_result.first_valid_index()).days >= (int_min_years * 365):
                ### Naming series for x13 performing:
                ser_result.name = 'Ticker'
                ### Calculating shift value: twice the absolute minimum moves a negative minimum above zero (the shift is zero when the minimum is zero):
                flo_positron = abs(ser_result.min()) * 2
                try:
                    ### Performing seasonality adjustment:
                    ser_result = x13_arima_analysis(ser_result + flo_positron, outlier = True, trading = True).seasadj - flo_positron
                except Exception as e:
                    ### Best effort: the vector is left unadjusted and the failure is reported with the exception class
                    ### (only Exception subclasses are caught, so a kernel interrupt is not swallowed):
                    print('SA error : ', ser_date.index.get_level_values('Observation_Date')[0], '/', e.__class__)
        ### Results output:
        return ser_result
    ### EI name extracting:
    str_index_name = ser_stock.index.get_level_values(0)[0]
    ser_stock = ser_stock.droplevel('Index_Name')
    ### Flags extracting:
    ser_flags = df_flags.loc[str_index_name, All].squeeze()
    ### 'TAR' type checking:
    if (ser_flags['Type_Prime'] == 'TAR'):
        print(str_index_name, ': TAR Primary Type ignoring')
    ### Flags-based transforming:
    else:
        ### Indices of NA values collecting:
        idx_isna = ser_stock.loc[ser_stock.isna()].index
        ### Seasonality adjustment testing:
        if (bool_perform_sa and (ser_flags['SA_Status'].strip(' ') != 'SA')):
            print(str_index_name, ': Seasonality adjustment')
            ### Filling empty values:
            ser_stock = ser_stock.groupby('Observation_Date').ffill()
            ser_stock = ser_stock.groupby('Observation_Date').transform(perform_x13_sa).swaplevel().sort_index()
            ### Dropping NA values:
            ser_stock.loc[idx_isna] = np.NaN
    ### Results output:
    return pd.concat([ser_stock], keys = [str_index_name], names = ['Index_Name'])
### Flags loading:
df_flags_typed = pd.read_hdf(str_path_bb_idx_hdf, key = str_key_flags_typed)
### Testing:
### NOTE(review): import placed in the middle of the notebook; _open_files is a private attribute of tables.file
import tables
tables.file._open_files.close_all()
### NOTE(review): both launches below are commented out, so the saved cell output could only be produced by an earlier edition of this cell
#ser_test_x13_sa = ser_test_stock.groupby('Index_Name', group_keys = False)\
#                                     .apply(complex_transform, idx_date_range, df_flags_typed, 0, int_min_years_z_score, bool_perform_sa = True)
#%timeit ser_test_stock.loc[['DFEDGBA Index'], All, ['2020-07-31','2020-08-31']].groupby('Index_Name', group_keys = False)\
#                                                          .transform(complex_transform, idx_date_range, df_flags_typed, 0, int_min_years_z_score, bool_perform_sa = True)

DFEDGBA Index : Seasonality adjustment
DFEDGBA Index : Seasonality adjustment
DFEDGBA Index : Seasonality adjustment
DFEDGBA Index : Seasonality adjustment
DFEDGBA Index : Seasonality adjustment
DFEDGBA Index : Seasonality adjustment
DFEDGBA Index : Seasonality adjustment
DFEDGBA Index : Seasonality adjustment
1.8 s ± 21.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [89]:
### TEMP

### Forcing a full garbage collection (the number shown as the cell output is the value returned by the call):
gc.collect()

3241

In [10]:
### TEMP

### Unique observation dates of the test ticker:
idx_obs_range = ser_test_stock.loc[['DFEDGBA Index'], All, All].index.get_level_values('Observation_Date').unique()
### Test ticker history for the last ten of these observation dates only (constant 'Index_Name' level dropped):
ser_test_single = ser_test_stock.loc[['DFEDGBA Index'], All, idx_obs_range[-10 : ]].droplevel('Index_Name')
#ser_test_single = ser_test_stock.loc[['DFEDGBA Index'], All, All].droplevel('Index_Name')
ser_test_single.name = 'Value'
ser_test_single

Data_Date   Observation_Date
2004-06-30  2020-03-31          47.8
            2020-04-27          47.8
            2020-04-30          47.8
            2020-05-26          47.8
            2020-06-01          47.8
                                ... 
2020-06-30  2020-07-27          -6.1
            2020-07-31          -6.1
            2020-08-31          -6.1
2020-07-31  2020-07-31          -3.0
            2020-08-31          -3.0
Name: Value, Length: 1920, dtype: float64

In [24]:
### TEMP

def perform_x13_sa(ser_date):
    """
    X13 ARIMA seasonality adjustment of the vector of a single observation date.

    The vector is adjusted only if it has at least one value and the distance between its first and last valid values
    is not less than int_min_years (module-level name) years of 365 days. A failed adjustment is reported by printing
    and the vector is returned unadjusted.

    Parameters:
        ser_date : series with 'Observation_Date' index level holding the same date for all the values.

    Returns:
        Resulting vector with 'Observation_Date' level added in front of its index.
    """
    ### Observation date of the vector extracting:
    date_observation = ser_date.index.get_level_values('Observation_Date')[0]
    ### Constant level dropping:
    ser_adjusted = ser_date.droplevel('Observation_Date')
    ### Non empty vector and minimal history length checking:
    bool_to_adjust = (ser_adjusted.count() > 0) and \
                     ((ser_adjusted.last_valid_index() - ser_adjusted.first_valid_index()).days >= (int_min_years * 365))
    if bool_to_adjust:
        ### Series naming for x13 performing:
        ser_adjusted.name = 'Ticker'
        ### Shift value to move the series up before adjustment and back after it:
        flo_shift = 2 * abs(ser_adjusted.min())
        try:
            ### Seasonality adjustment of the shifted series:
            ser_shifted = ser_adjusted + flo_shift
            ser_adjusted = x13_arima_analysis(ser_shifted, outlier = True, trading = True).seasadj - flo_shift
        except Exception as e:
            print('SA error : ', date_observation, '/', e.__class__)
    ### Results output:
    return pd.concat([ser_adjusted], keys = [date_observation], names = ['Observation_Date'])

def transformParallel(serGrouped, func, n_jobs = 4):
    """
    Applying a function to every group in parallel and concatenating the results.

    Parameters:
        serGrouped : iterable of (name, group) pairs, e.g. a groupby object; only the group is passed to func;
        func : function of a single group returning an object suitable for pd.concat;
        n_jobs : number of joblib workers (defaults to the previously hard-coded value of 4).

    Returns:
        Results of func for all the groups concatenated in groups iteration order.

    Raises:
        ValueError : raised by pd.concat if there are no groups to process.
    """
    ### Group names are ignored: func has to restore any level it needs by itself
    retLst = Parallel(n_jobs = n_jobs)(delayed(func)(group) for name, group in serGrouped)
    return pd.concat(retLst)

### Minimal history length in years; read as a module-level name by perform_x13_sa defined in this cell:
int_min_years = 7

#print('regular version: ')
#%timeit ser_test_single.groupby('Observation_Date').transform(perform_x13_sa).sort_index(level = ['Observation_Date', 'Data_Date'])

#print('parallel version: ')
#%timeit transformParallel(ser_test_single.groupby('Observation_Date'), perform_x13_sa).sort_index(level = ['Observation_Date', 'Data_Date'])

In [None]:
### TEMP

### Sequential launch of the adjustment for each observation date
### NOTE(review): the cell has no execution number and no saved output; perform_x13_sa of the previous cell returns a vector with an added index level,
### which may not be accepted by transform - TODO confirm before relying on this cell
ser_test_single.groupby('Observation_Date').transform(perform_x13_sa).sort_index(level = ['Observation_Date', 'Data_Date'])

In [25]:
### TEMP

### Parallel launch of the adjustment for each observation date; the result is sorted by observation date and then levels are swapped to put 'Data_Date' first:
transformParallel(ser_test_single.groupby('Observation_Date'), perform_x13_sa).sort_index(level = ['Observation_Date', 'Data_Date']).swaplevel()

Data_Date   Observation_Date
2004-06-30  2020-03-31          47.358938
2004-07-31  2020-03-31          39.502098
2004-08-31  2020-03-31          36.851518
2004-09-30  2020-03-31          30.178766
2004-10-31  2020-03-31          28.379187
                                  ...    
2020-03-31  2020-08-31         -67.926482
2020-04-30  2020-08-31         -67.407643
2020-05-31  2020-08-31         -49.261507
2020-06-30  2020-08-31          -7.672658
2020-07-31  2020-08-31          -4.089245
Name: seasadj, Length: 1920, dtype: float64

In [36]:
### TEMP

### Displaying the current test vector
### NOTE(review): the saved output of this cell has a different length than the one saved where ser_test_single is defined,
### so it presumably comes from another edition of that cell (out-of-order execution) - re-run the notebook from the top to refresh it
ser_test_single

Data_Date   Observation_Date
2004-06-30  2020-07-31          47.8
            2020-08-31          47.8
2004-07-31  2020-07-31          39.2
            2020-08-31          39.2
2004-08-31  2020-07-31          37.3
                                ... 
2020-05-31  2020-08-31         -49.2
2020-06-30  2020-07-31          -6.1
            2020-08-31          -6.1
2020-07-31  2020-07-31          -3.0
            2020-08-31          -3.0
Name: Value, Length: 388, dtype: float64