In [1]:
#### MRI GENERATOR

In [2]:
#### STANDARD MODULES INITIALISING

### Importing standard modules and date-special modules:
import math
import os
from datetime import date

import numpy as np
import pandas as pd

In [3]:
#### MRI CONSTANTS AND PARAMETERS SETTING

### Standard date format string for the notebook:
date_format = '%Y-%m-%d'
### MRI dates:
### date_first and date_last bound the business-day index that the source data is reindexed to;
### date_start is the cut-off used when counting how many z-scores each asset has available.
date_first = date(1990, 1, 1)
date_last = date(2018, 12, 31)
date_start = date(1993, 12, 31)
### Source xlsx file attributes (workbook path and name of the model description sheet):
path_mri_data_xlsx = 'Data_Files/Source_Files/mri_data.xlsx'
mri_model_name = 'Model 01'
### HDF5 file (path and object key) that stores the source data aggregated from the xlsx workbook:
path_mri_data_hdf = 'Data_Files/Source_Files/mri_data.h5'
key_mri_data_hdf = 'source_data'

### Limits of the rolling windows for z-score calculation:
### asset_window_min / asset_window_max are passed to get_rolling_z_score as the minimal number of
### observations and the window length.
### The mri_window_* limits are not used in the cells visible here - presumably they serve the
### index-level z-scoring further down; verify.
### NOTE(review): mri_window_max is built from 260 days per year, the other limits from 252 - confirm this is intended.
asset_window_min = 252
asset_window_max = 252 * 100
mri_window_min = 252
mri_window_max = 260 * 10
### Lower and upper boundaries for z-score winsorizing:
arr_winsor_boundary = [-4, 4]
### Limit of the moving average window for MRI calculation:
mri_moving_average_window_max = 5
### HDF5 file with MRI group matrices built from z-scored means of the standardized winsorized z-score matrices of each group's assets:
path_mri_standart_hdf = 'Data_Files/Source_Files/mri_group_z_matrix.h5'
### HDF5 file with MRI asset level info (path and object keys):
path_mri_assets_hdf = 'Data_Files/Source_Files/mri_released_assets.h5'
object_selected_data_hdf = 'selected_data'
object_standartized_data_hdf = 'standartized_data'
### HDF5 file with MRI group level info (path and object keys):
path_mri_groups_hdf = 'Data_Files/Source_Files/mri_released_groups.h5'
object_diag_grouped_hdf = 'diag_grouped_data'
object_perc_grouped_hdf = 'percentile_grouped_data'
### HDF5 file with MRI level info (path and object keys):
path_mri_index_hdf = 'Data_Files/Source_Files/mri_released_index.h5'
object_diag_mri_hdf = 'diag_MRI_data'
object_raw_perc_mri_hdf = 'raw_perc_MRI_data'
object_ma_perc_mri_hdf = 'ma_perc_MRI_data'

In [4]:
### MRI DATA AGGREGATING FUNCTION
def get_mri_data(source_file_path, source_model_sheet, hdf_file_path, hdf_object_key, date_index, update_hdf = True):
    """
    Load the MRI model description and the asset source data aligned to date_index.

    Parameters:
        source_file_path: path of the source xlsx workbook.
        source_model_sheet: name of the sheet with the model description (header in the second row,
            first seven columns are read; columns 'Asset Group', 'Asset Code' and 'Asset Tab Name' are used here).
        hdf_file_path: path of the HDF5 file that stores the aggregated source data.
        hdf_object_key: key of the aggregated source data inside the HDF5 file.
        date_index: index the source data is reindexed to (missing dates are forward filled).
        update_hdf: if True, the asset sheets are re-read from the workbook and the HDF5 file is overwritten;
            if False, the aggregated data is read from the HDF5 file.

    Returns:
        [df_model_asset, df_model_mri, df_selected_data]: model rows of the assets, model rows of the
        'MRI' group, and the forward filled source data indexed by date_index (index named 'Date').

    Side effects:
        With update_hdf = True the HDF5 file is rewritten (mode 'w').
    """
    ### Importing standard modules:
    import pandas as pd

    ### Reading Model information from Source model sheet:
    df_model_raw = pd.read_excel(source_file_path, sheet_name = source_model_sheet, header = 1, usecols = [0, 1, 2, 3, 4, 5, 6])
    ### Group border rows deleting:
    df_model_raw = df_model_raw[df_model_raw['Asset Group'] != df_model_raw['Asset Code']]
    ### Dividing list on asset part and MRI weights part
    ### (reset_index returns new frames, so no copy-of-a-slice flags leak to the callers):
    bool_mri_rows = (df_model_raw['Asset Group'] == 'MRI')
    df_model_asset = df_model_raw[~bool_mri_rows].reset_index(drop = True) ### Asset part
    df_model_mri = df_model_raw[bool_mri_rows].reset_index(drop = True) ### MRI part
    ### Extracting source data from initial excel file or from saved hdf
    if (update_hdf):
        ### Aggregating data from the source xlsx file to pd.DataFrame (the workbook is opened only once):
        arr_tab_data = []
        with pd.ExcelFile(source_file_path) as xlsx_source:
            for iter_index, iter_row in df_model_asset.iterrows():
                ### read_excel(squeeze = True) was removed in pandas 2.0, squeezing explicitly instead:
                ser_iter_tab = pd.read_excel(xlsx_source, sheet_name = iter_row['Asset Tab Name'], header = 0, index_col = 0).squeeze('columns')
                ser_iter_tab.name = iter_row['Asset Code']
                arr_tab_data.append(ser_iter_tab)
        df_source_data = pd.concat(arr_tab_data, axis = 1, join = 'outer').astype(float)
        df_source_data.to_hdf(hdf_file_path, key = hdf_object_key, mode = 'w', format = 'fixed', append = False)
    else:
        df_source_data = pd.read_hdf(hdf_file_path, key = hdf_object_key)
    ### Filtering by date_index and forward filling missing values:
    df_selected_data = df_source_data.ffill().reindex(date_index, method = 'ffill')
    df_selected_data.index.name = 'Date'
    return [df_model_asset, df_model_mri, df_selected_data]

In [5]:
### GETTING MRI DATA FOR FUTURE CALCULATIONS
index_mri_date = pd.date_range(date_first, date_last, freq = 'B')
### The HDF5 file with aggregated source data is rebuilt from the xlsx workbook only when it does not exist yet,
### so the notebook also runs on a fresh checkout; delete the HDF5 file to force re-reading the workbook:
bool_update_hdf = not os.path.isfile(path_mri_data_hdf)
[df_model_asset, df_model_mri, df_selected_data] = get_mri_data(path_mri_data_xlsx, mri_model_name, path_mri_data_hdf, key_mri_data_hdf, 
                                                                index_mri_date, update_hdf = bool_update_hdf)

In [6]:
def get_rolling_z_score(ser_source, min_wnd, max_wnd, winsor_bottom, winsor_top):
    """
    Calculate plain and winsorized rolling z-scores of a data vector.

    Parameters:
        ser_source: pd.Series to standardize; its index labels must support subtraction of
            pd.offsets.BusinessDay (e.g. a DatetimeIndex).
        min_wnd: minimal number of non-NaN observations a window needs to be z-scored.
        max_wnd: window length (number of rows for the plain rolling z-score, number of business days
            back from the window end for the z-score matrix; the label slice includes both ends).
        winsor_bottom: lower boundary for z-score winsorizing.
        winsor_top: upper boundary for z-score winsorizing.

    Returns:
        [ser_rolling_z_score, ser_rolling_z_winsor, df_z_matrix]:
        ser_rolling_z_score - plain rolling z-score of ser_source;
        ser_rolling_z_winsor - diagonal of df_z_matrix, with the leading part replaced by the first
            filled column of df_z_matrix (all NaN if no window has enough observations);
        df_z_matrix - square matrix whose column for each window end holds the winsorized z-scores of
            all observations of that window.
    """
    ### Importing standard modules:
    import numpy as np
    import pandas as pd

    ### Safety limits of the iterative winsorization: values exceeding a boundary by no more than the
    ### tolerance are accepted (re-standardizing can leave floating point noise above the boundary),
    ### and the number of clip / re-standardize passes per window is capped:
    num_winsor_tolerance = 1e-9
    int_winsor_iter_max = 1000

    ### Calculating rolling z-score from rolling mean and rolling standard deviation:
    obj_rolling = ser_source.rolling(window = max_wnd, min_periods = min_wnd, win_type = None)
    ser_rolling_z_score = (ser_source - obj_rolling.mean()) / obj_rolling.std()
    ### Initializing resulting variables:
    df_z_matrix = pd.DataFrame(np.nan, index = ser_source.index, columns = ser_source.index, dtype = float)
    ### Calculating z-score matrix:
    for iter_end_index in ser_source.index:
        ### Isolating rolling window for particular data vector element:
        iter_start_index = iter_end_index - pd.offsets.BusinessDay(max_wnd)
        ser_iter_source = ser_source.loc[iter_start_index : iter_end_index]
        ### Checking that at least min_wnd elements of rolling window are not NaN:
        if (ser_iter_source.count() < min_wnd):
            continue
        ser_iter_z_score = (ser_iter_source - ser_iter_source.mean()) / ser_iter_source.std()
        ### Winsorization process (at least one pass is always performed):
        for iter_winsor in range(int_winsor_iter_max):
            ### Value based winsorization:
            ser_iter_z_score = ser_iter_z_score.clip(lower = winsor_bottom, upper = winsor_top)
            ### Recalculating of z scores:
            ser_iter_z_score = (ser_iter_z_score - ser_iter_z_score.mean()) / ser_iter_z_score.std()
            ### Checking for boundaries: a value is out of bounds when it is below the bottom OR above the top
            ### (comparisons with NaN are False, so an all-NaN result of a constant window stops the loop):
            ser_out_of_bounds = ((ser_iter_z_score < winsor_bottom - num_winsor_tolerance) |
                                 (ser_iter_z_score > winsor_top + num_winsor_tolerance))
            if not ser_out_of_bounds.any():
                break
        ### Filling z matrix column part after the winsorizing:
        df_z_matrix.loc[iter_start_index : iter_end_index, iter_end_index] = ser_iter_z_score.values
    ### Getting winsorized z values (copy, because the diagonal may be a read-only view):
    ser_rolling_z_winsor = pd.Series(np.diag(df_z_matrix.values).copy(), index = ser_source.index)
    ### Backfilling with first not NaN column of z matrix (nothing to backfill if no window was filled):
    ind_valid_index = ser_rolling_z_winsor.first_valid_index()
    if (ind_valid_index is not None):
        ser_rolling_z_winsor.loc[ : ind_valid_index] = df_z_matrix.loc[ : ind_valid_index, ind_valid_index]

    return [ser_rolling_z_score, ser_rolling_z_winsor, df_z_matrix]

In [7]:
def get_standartized_mri_data(df_model_asset, df_selected_data, date_start, asset_window_min, asset_window_max, arr_winsor_boundary, hdf_file_path):
    """
    Standardize source data of all assets and build z-score matrices of asset groups.

    For every asset group the first asset (earliest first observation) is the base asset. Each asset is
    z-scored with get_rolling_z_score; an asset that has less than 2/3 of the base asset's filled
    winsorized z-scores up to date_start gets the affected columns of its z matrix rescaled with the
    standard deviation and mean of the matching part of the base asset's z matrix. The group matrix is
    the plain mean of the asset matrices by date, z-scored column by column.

    Parameters:
        df_model_asset: model description of the assets (columns 'Asset Code' and 'Asset Group' are used).
        df_selected_data: source data, one column per asset code.
        date_start: cut-off date for counting the filled z-scores of an asset.
        asset_window_min: minimal number of observations for z-scoring.
        asset_window_max: rolling window length for z-scoring.
        arr_winsor_boundary: [lower, upper] boundaries for z-score winsorizing.
        hdf_file_path: HDF5 file the stacked group z matrices are saved to (one key per group name).

    Returns:
        [ser_asset_standartized, ser_group_mean_z_diag]: diagonals of the asset z matrices
        (levels 'Asset Code', 'Date') and diagonals of the group z matrices (levels 'Group Name', 'Date').

    Side effects:
        The passed df_model_asset gets an 'Asset Date' column and is sorted in place by
        ['Asset Group', 'Asset Date']; group matrices are written to hdf_file_path in mode 'a';
        progress messages are printed.
    """
    ### Importing standard modules:
    import numpy as np
    import pandas as pd

    ### Cut-off as Timestamp for label slicing:
    ts_date_start = pd.Timestamp(date_start)
    ### Base assets determination (resorting by earliest value):
    df_model_asset['Asset Date'] = df_model_asset['Asset Code'].map(lambda iter_code: df_selected_data[iter_code].dropna().index.min())
    df_model_asset.sort_values(['Asset Group', 'Asset Date'], inplace = True)
    df_model_asset = df_model_asset.reset_index(drop = True)

    ### Initialising loop visibility variables:
    dict_group_diag_container = {} ### Group z-matrices diagonals container
    dict_asset_vector_container = {} ### Asset z-matrices diagonals container
    ### Standardizing loop on group level:
    for asset_group_name, df_asset_group in df_model_asset.groupby('Asset Group'):
        ### Initialising group visibility variables:
        print('get_standartized_mri_data: group', asset_group_name, 'standartizing started')
        dict_asset_matrix_container = {} ### Asset matrices collection for group mean matrix calculation
        df_base_z_matrix = None
        int_base_filled = 0
        ### Standardizing cycle on asset level within the group:
        for asset_code in df_asset_group['Asset Code']:
            ### Performing z scoring for asset:
            [ser_asset_z_score_simple, ser_asset_z_score_winsor, df_asset_z_matrix] = get_rolling_z_score(df_selected_data[asset_code],
                                                                                                          asset_window_min, asset_window_max,
                                                                                                          arr_winsor_boundary[0], arr_winsor_boundary[1])
            ### Calculating asset filled quantity before date_start:
            int_asset_filled = ser_asset_z_score_winsor.loc[ : ts_date_start].dropna().count()
            if (df_base_z_matrix is None):
                ### The first asset of the group is the base asset and is used without modification:
                df_base_z_matrix = df_asset_z_matrix
                int_base_filled = int_asset_filled
            elif (int_asset_filled < int_base_filled * 2 / 3):
                ### Renormalizing z matrix of an asset without enough initial values with base z matrix data:
                index_asset_start = ser_asset_z_score_simple.first_valid_index()
                ### Nothing to renormalize if the asset has no valid z-score at all:
                if (index_asset_start is not None):
                    index_asset_limit = index_asset_start + pd.offsets.BusinessDay(asset_window_max)
                    index_base_start = index_asset_start - pd.offsets.BusinessDay(asset_window_min)
                    index_all = ser_asset_z_score_simple.index
                    for index_asset_end in index_all[(index_all >= index_asset_start) & (index_all <= index_asset_limit)]:
                        ser_base_z_part = df_base_z_matrix.loc[index_base_start : index_asset_end, index_asset_end]
                        df_asset_z_matrix.loc[:, index_asset_end] = df_asset_z_matrix.loc[:, index_asset_end] * ser_base_z_part.std() + ser_base_z_part.mean()
            ### Defining standardized values of asset as diagonal of (possibly modified) z matrix (without backfilling):
            dict_asset_vector_container[asset_code] = pd.Series(np.diag(df_asset_z_matrix.values).copy(), index = df_asset_z_matrix.index)
            ### Adding asset matrix to a whole group dataset:
            dict_asset_matrix_container[asset_code] = df_asset_z_matrix
            print('get_standartized_mri_data: asset', asset_code, 'in group', asset_group_name, 'standartized successfully')
        ### Calculating z matrix for group as mean of asset matrices by date:
        df_group_mean = pd.concat(dict_asset_matrix_container, axis = 0, names = ['Asset Code', 'Date'], copy = False)
        df_group_mean = df_group_mean.groupby('Date').mean()
        ### Z-scoring group mean matrix column by column:
        df_group_mean_z = (df_group_mean - df_group_mean.mean()) / df_group_mean.std()
        ### Adding diagonal of group mean z-score matrix to MRI dataset:
        dict_group_diag_container[asset_group_name] = pd.Series(np.diag(df_group_mean_z.values).copy(), index = df_group_mean_z.index)
        print('get_standartized_mri_data: z-score matrix for group' , asset_group_name, 'mean matrix builded successfully')
        ### Saving group matrix to hdf file for further manipulations:
        df_group_mean_z.stack(dropna = False).to_hdf(hdf_file_path, key = asset_group_name, mode = 'a', format = 'fixed')
        print('get_standartized_mri_data: z-score matrix for group' , asset_group_name, 'saved to HDF5 file', hdf_file_path, '(object key:', asset_group_name, ')')
    ### Collection of standardized z-scores for all assets:
    ser_asset_standartized = pd.concat(dict_asset_vector_container, axis = 0, names = ['Asset Code', 'Date'], copy = False)
    ser_asset_standartized = ser_asset_standartized.astype(float)
    print('get_standartized_mri_data: asset standartized z-score collection builded successfully')
    ### Collection of diagonals of group's z matrices for all groups:
    ser_group_mean_z_diag = pd.concat(dict_group_diag_container, axis = 0, names = ['Group Name', 'Date'], copy = False)
    print('get_standartized_mri_data: data vector collection of diagonales of mean z score matrix for all groups builded successfully')
    ser_group_mean_z_diag = ser_group_mean_z_diag.astype(float)

    return [ser_asset_standartized, ser_group_mean_z_diag]

In [8]:
#### STANDARDIZING SOURCE DATA FOR MRI CALCULATION

### Standardizing the dataset. The call below:
###   - builds the collection of standardized winsorized z-scores for all assets;
###   - builds the collection of diagonals of the group z matrices for all groups;
###   - saves the group z matrices to path_mri_standart_hdf.
[ser_standartized_assets, ser_diag_mean_z_groups] = get_standartized_mri_data(
    df_model_asset = df_model_asset,
    df_selected_data = df_selected_data,
    date_start = date_start,
    asset_window_min = asset_window_min,
    asset_window_max = asset_window_max,
    arr_winsor_boundary = arr_winsor_boundary,
    hdf_file_path = path_mri_standart_hdf)
### Saving results to HDF5 to avoid repeating the heavy calculations while source model and datasets stay constant
### (assets first, then groups):
for (iter_ser_to_save, iter_path_hdf, iter_key_hdf) in [(ser_standartized_assets, path_mri_assets_hdf, object_standartized_data_hdf),
                                                        (ser_diag_mean_z_groups, path_mri_groups_hdf, object_diag_grouped_hdf)]:
    iter_ser_to_save.to_hdf(iter_path_hdf, key = iter_key_hdf, mode = 'w', format = 'fixed')

get_standartized_mri_data: group EQ standartizing started
get_standartized_mri_data: asset iv_us in group EQ standartized successfully
get_standartized_mri_data: asset iv_eu in group EQ standartized successfully
get_standartized_mri_data: asset iv_uk in group EQ standartized successfully
get_standartized_mri_data: asset iv_jp in group EQ standartized successfully
get_standartized_mri_data: asset iv_rvx in group EQ standartized successfully
get_standartized_mri_data: asset iv_eem in group EQ standartized successfully
get_standartized_mri_data: z-score matrix for group EQ mean matrix builded successfully
get_standartized_mri_data: z-score matrix for group EQ saved to HDF5 file Data_Files/Source_Files/mri_group_z_matrix.h5 (object key: EQ )
get_standartized_mri_data: group FI standartizing started
get_standartized_mri_data: asset oas_hy in group FI standartized successfully
get_standartized_mri_data: asset oas_em in group FI standartized successfully
get_standartized_mri_data: z-score mat

In [14]:
### Saving results to HDF5 to avoid repeating the heavy calculations while source model and datasets stay constant
### (assets first, then groups).
### NOTE(review): these are the same two saves (same objects, paths, keys and mode) that end the standardizing
### cell above, and the execution count of this cell is out of sequence - confirm whether the cell is still needed.
for (iter_ser_to_save, iter_path_hdf, iter_key_hdf) in [(ser_standartized_assets, path_mri_assets_hdf, object_standartized_data_hdf),
                                                        (ser_diag_mean_z_groups, path_mri_groups_hdf, object_diag_grouped_hdf)]:
    iter_ser_to_save.to_hdf(iter_path_hdf, key = iter_key_hdf, mode = 'w', format = 'fixed')

In [None]:
##########################################################################################################################################################################