In [1]:
### CI FACTOR DAILY GENERATOR

In [2]:
### MODULES IMPORT

import pandas as pd
import numpy as np
from datetime import date, datetime
import os ### To work with csv files - should be ignored in product code

### To test parallel calculations:
from joblib import Parallel, delayed
### To profile only:
%load_ext line_profiler
import gc 

In [3]:
## VERSION CONTROL

from platform import python_version
print('pandas version: ', pd.__version__)
print('python version: ', python_version())

pandas version:  0.25.3
python version:  3.7.4


In [4]:
### INTERNAL PARAMETERS INITIALIZATION (TO BE IGNORED IN PRODUCT CODE)

### Common constants:
All = slice(None)

### Results saving:
str_ci_var1_raw_csv = 'Data_Files/Test_Files/ci_var1_raw.csv'
str_ci_rbq_raw_csv = 'Data_Files/Test_Files/ci_rbq_raw.csv'
str_ci_beta_nn_raw_csv = 'Data_Files/Test_Files/ci_beta_nn_raw.csv'
### General daily-mode ranges parameters:
str_date_source_start = '2002-01-01' ### Start date for ISON Universe
str_date_factor_start = '2004-12-31' ### End date for efficacy measures
str_date_factor_end = '2021-11-30' ### End date for efficacy measures
idx_test_monthly_range = pd.date_range(str_date_factor_start, str_date_factor_end, freq = 'BM') ### Range for source data filtering
idx_test_daily_range = pd.date_range(str_date_factor_start, str_date_factor_end, freq = 'B') ### Range for source data filtering
### Sample data source:
str_path_returns = 'Data_Files/Source_Files/msci_sample.xlsx'
str_sheet_returns = 'Test Returns'

In [5]:
### DEFINING WEIGHTED AVERAGE

def weighted_average(ser_data, ser_weight = False, int_min_count = 0):
    ### Default output:
    num_result = np.NaN
    ### Checking for data presence:
    if (ser_data.count() > int_min_count):       
        ### Checking for weights dataset:
        if isinstance(ser_weight, bool):
            ### Calculating of simple average:
            num_result = np.nanmean(ser_data.values)
        else:
            ### Weights filtering:
            list_weight = ser_weight[ser_data.dropna().index].values
            ### Checking for weights presence:
            if np.nansum(list_weight):
                ### Data filtering:
                list_data = ser_data.dropna().values
                ### Weighted average calculating:
                num_result = np.nansum(list_data * list_weight) / np.nansum(list_weight)
    ### Results output:
    return num_result

In [6]:
### DEFINING MULTI-STEP STANDARTIZATION FOR SEPARATE SERIES

def multistep_standartize(ser_data_source, arr_truncate, ser_weight = False, reuse_outliers = False, center_result = True, full_result = False):  
    ### Arrays of iterations properties:
    arr_mean = []
    arr_std = []
    ### Adding equal weights, when weights are absent:
    if isinstance(ser_weight, bool):
        ser_weight = pd.Series(1, index = ser_data_source.index)
        ser_weight.name = 'Weight'    
    ### Workhorse and resulting data vectors initialising:
    ser_data_iter = ser_data_source.dropna()
    ser_weight_iter = ser_weight.copy()
    ser_data_full = pd.Series(np.NaN, index = ser_data_iter.index)
    ### Looping by boundaries array:
    for num_bound_iter in arr_truncate:
        ### Properties calculating and saving:
        num_mean_iter = weighted_average(ser_data_iter, ser_weight_iter)
        num_std_iter = ser_data_iter.std()
        arr_mean.append(num_mean_iter)
        arr_std.append(num_std_iter)
        ser_data_iter = (ser_data_iter - num_mean_iter) / num_std_iter       
        ### Standartizing:
        if reuse_outliers:
            ser_data_iter[ser_data_iter.abs() >= num_bound_iter] = np.sign(ser_data_iter) * num_bound_iter 
        else:
            ### Saving to result and excluding from further calculations truncated values:             
            ser_data_full.where(ser_data_iter.abs() < num_bound_iter, np.sign(ser_data_iter) * num_bound_iter, inplace = True)      
            ser_data_iter = ser_data_iter[ser_data_iter.abs() < num_bound_iter]           
    ### Aggregating result:
    if (reuse_outliers):
        ser_data_full = ser_data_iter
    else:     
        ser_data_full[ser_data_iter.index] = ser_data_iter
    ### Centering result:
    if (center_result):
        ser_result = ser_data_full - weighted_average(ser_data_full, ser_weight) 
    else:
        ser_result = ser_data_full    
    ### Result output:
    ser_result.name = str(ser_data_source.name) + '_standartized'
    if (full_result):
        return (ser_result, arr_mean, arr_std)
    else:
        return ser_result

In [7]:
### DEFINING MATLAB STYLE PRCTILE

def prctile_matlab(ser_to_perc, p):
    ### Sorted list preparing:
    list_sorted = sorted(ser_to_perc.dropna().values)
    ### Length calculating:
    num_len = len(list_sorted)    
    ### Prctile calculating:
    num_result = np.interp(np.array(p), np.linspace(1 / (2 * num_len), (2 * num_len - 1) / (2 * num_len), num_len), list_sorted)
    ### Results output:
    return num_result

In [8]:
### DEFINING LOG BASED MOVING AVERAGE FUNCTION

def log_based_ma(ser_source_raw, int_roll_max, int_roll_min):
    ### Drop level to avoid stack/unstack manipulations:
    if (len(ser_source_raw.index.names) == 2):
        ser_source_raw = ser_source_raw.droplevel(-1)
    ### Log bsed moving average calculating:
#    ser_source_res = ser_source_raw.rolling(window = int_roll_max, min_periods = int_roll_min)\
#                                   .apply(lambda ser_roll_win: np.log(1 + ser_roll_win).mean(), raw = False) ### REPLACED !!!!!!!!!!!!!!!!   
    ser_source_res = ser_source_raw.rolling(window = int_roll_max, min_periods = int_roll_min)\
                                   .apply(lambda arr_roll_win: np.nanmean(np.log(1 + arr_roll_win)), raw = True) ### ADDED !!!!!!!!!!!!!!!!
    return ser_source_res

In [9]:
### DEFINING STANDARTIZE -> RECOVER -> NEGATIVE RETURNS CLIP TRANSFORMATION

def get_clean_returns(ser_source_raw, idx_source_range):
#def get_country_vector(ser_source_raw):    
    ### Drop level to avoid stack/unstack manipulations:
    if (len(ser_source_raw.index.names) == 2):       
        ser_source_raw = ser_source_raw.droplevel(-1)        
    ### Source vector cleaning:
    ser_source_raw = ser_source_raw.replace([np.inf, -np.inf], np.NaN).replace(0.0, np.NaN)
    ### Source vector standartizing:
    ser_source_std, list_mean, list_std = multistep_standartize(ser_source_raw, [7.5, 6.0], full_result = True)
    ### Source vector recovering:
    ser_source_rec = (ser_source_std * list_std[1] + list_mean[1]) * list_std[0] + list_mean[0]

#    ### Source vector standartizing:
#    ser_source_std, list_mean, list_std = multistep_standartize(ser_source_raw, [7.5], full_result = True)
#    ### Source vector recovering:
#    ser_source_rec = ser_source_std * list_std[0] + list_mean[0]

#    ser_source_rec = ser_source_raw + 0

    ser_source_rec.loc[ser_source_rec < -1.0] = -0.9
    ### Source vector to rates:
    ser_source_rate = (1 + ser_source_rec.fillna(0.0)).cumprod()
    ### Rates vector reindexing:
#    idx_source_range = pd.date_range(start = ser_source_rec.index[0], end = ser_source_rec.index[-1], freq = 'B') ### DROPPED !!!!!!!!!!!!!!!!!!
    ser_source_rate = ser_source_rate.reindex(idx_source_range)
    ### Rates vector forward filling:
    ser_source_rate = ser_source_rate.ffill()
    ### Rates vector to returns:
    ser_source_res = ser_source_rate.diff() / ser_source_rate.shift()
    ser_source_res = ser_source_res.replace(0.0, np.NaN)
    ### Result output:    
    ser_source_res.index.names = ['Date']
    ser_source_res.name = 'Returns'
    return ser_source_res

def get_clean_returns_to_parallel(ser_source_raw, idx_source_range):
#def get_country_vector(ser_source_raw):    
    ### Drop level to avoid stack/unstack manipulations:
    bool_country = False
    if (len(ser_source_raw.index.names) == 2):
        bool_country = True
        str_country = ser_source_raw.index[0][-1]
        ser_source_raw = ser_source_raw.droplevel(-1)
        
    ### Source vector cleaning:
    ser_source_raw = ser_source_raw.replace([np.inf, -np.inf], np.NaN).replace(0.0, np.NaN)
    ### Source vector standartizing:
    ser_source_std, list_mean, list_std = multistep_standartize(ser_source_raw, [7.5, 6.0], full_result = True)
    ### Source vector recovering:
    ser_source_rec = (ser_source_std * list_std[1] + list_mean[1]) * list_std[0] + list_mean[0]

#    ### Source vector standartizing:
#    ser_source_std, list_mean, list_std = multistep_standartize(ser_source_raw, [7.5], full_result = True)
#    ### Source vector recovering:
#    ser_source_rec = ser_source_std * list_std[0] + list_mean[0]

#    ser_source_rec = ser_source_raw + 0

    ser_source_rec.loc[ser_source_rec < -1.0] = -0.9
    ### Source vector to rates:
    ser_source_rate = (1 + ser_source_rec.fillna(0.0)).cumprod()
    ### Rates vector reindexing:
#    idx_source_range = pd.date_range(start = ser_source_rec.index[0], end = ser_source_rec.index[-1], freq = 'B') ### DROPPED !!!!!!!!!!!!!!!!!!
    ser_source_rate = ser_source_rate.reindex(idx_source_range)
    ### Rates vector forward filling:
    ser_source_rate = ser_source_rate.ffill()
    ### Rates vector to returns:
    ser_source_res = ser_source_rate.diff() / ser_source_rate.shift()
    ser_source_res = ser_source_res.replace(0.0, np.NaN)
    ### Result output:    
    ser_source_res.index.names = ['Date']
    ser_source_res.name = 'Returns'
    if bool_country:
        ser_source_res = pd.concat({str_country: ser_source_res}, names = ['Country'])  
    return ser_source_res

In [10]:
### DEFINING PARALLEL PRCESSING DECORATORS

def transformParallel(serGrouped, func):
    retLst = Parallel(n_jobs = 4)(delayed(func)(group) for name, group in serGrouped)    
    return pd.concat(retLst)

def transformParallel_with_params(serGrouped, func, *args):
    retLst = Parallel(n_jobs = 4)(delayed(func)(group, *args) for name, group in serGrouped)    
    return pd.concat(retLst)

In [11]:
### DEFINING VALUE-AT-RISK FACTOR CREATING FUNCTION

def get_var1_factor(iter_date):
    ### Factor parameters:
    int_stand_win = 260 * 5
    int_factor_win = 260 * 1
    flo_pctile = 0.01
    ### Source load parameters:
    date_stand_start = iter_date - pd.tseries.offsets.BDay(int_stand_win - 1)
    date_factor_start = iter_date - pd.tseries.offsets.BDay(int_factor_win - 1)
    ### Datasource for particular date (should be substituted by SQL query):
    ser_iter_asset_raw = ser_country_ret_raw.loc[date_stand_start : iter_date, All]
    idx_iter_range = pd.date_range(start = date_stand_start, end = iter_date, freq = 'B')
    ### Data source transforming:
#    ser_iter_asset_trans = ser_iter_asset_raw.groupby('Country').apply(get_clean_returns, idx_iter_range).swaplevel().sort_index()
    ser_iter_asset_trans = transformParallel_with_params(ser_iter_asset_raw.groupby('Country'), get_clean_returns_to_parallel, idx_iter_range).swaplevel().sort_index()  
    ### Last year extracting:
    ser_iter_asset_cut = ser_iter_asset_trans.loc[date_factor_start : iter_date, All]
    ### Value-at-risk factor calculation (with minimum observations number check):
    ser_iter_factor = ser_iter_asset_cut.groupby('Country').apply(lambda ser_country: prctile_matlab(ser_country, flo_pctile) \
                                                                  if (ser_country.count() > (int_factor_win // 2)) else np.NaN)
    ### Add to csv file (should be substituted by SQL query):
    ser_iter_factor_csv = pd.concat({iter_date: ser_iter_factor}, names = ['Date'])
    ser_iter_factor_csv.to_csv(str_ci_var1_raw_csv, mode = 'a', header = not os.path.exists(str_ci_var1_raw_csv), sep = ';')
    ### Results output:
    return ser_iter_factor

In [12]:
### DEFINING RETURN BASED QUALITY FACTOR CREATING FUNCTION

def get_rbq_factor(iter_date):
    ### Factor parameters:
    int_stand_win = 260 * 5
    int_factor_win = 260 * 1
    int_roll_max = 22
    int_roll_min = 10    
    ### Source load parameters:
    date_stand_start = iter_date - pd.tseries.offsets.BDay(int_stand_win - 1)
    date_factor_start = iter_date - pd.tseries.offsets.BDay(int_factor_win - 1)
    date_ma_start = iter_date - pd.tseries.offsets.BDay(int_factor_win + int_roll_max - 1)
    ### Datasource for particular date (should be substituted by SQL query):
    ser_iter_asset_raw = ser_country_ret_raw.loc[date_stand_start : iter_date, All]
    ser_iter_bench_raw = ser_bench_ret_raw.loc[date_stand_start : iter_date]
    idx_iter_range = pd.date_range(start = date_stand_start, end = iter_date, freq = 'B')
    ### Data source transforming:
#    ser_iter_asset_trans = ser_iter_asset_raw.groupby('Country').apply(get_clean_returns, idx_iter_range).swaplevel().sort_index()
    ser_iter_asset_trans = transformParallel_with_params(ser_iter_asset_raw.groupby('Country'), get_clean_returns_to_parallel, idx_iter_range).swaplevel().sort_index()    
    ser_iter_bench_trans = get_clean_returns(ser_iter_bench_raw, idx_iter_range)
    ### Moving average calculating:
    ser_iter_asset_ma = ser_iter_asset_trans.loc[date_ma_start : iter_date, All].groupby('Country').apply(log_based_ma, int_roll_max, int_roll_min).swaplevel().sort_index()
    ser_iter_bench_ma = log_based_ma(ser_iter_bench_trans[date_ma_start : iter_date], int_roll_max, int_roll_min)
    ### Last year extracting:
    ser_iter_asset_ma_cut = ser_iter_asset_ma.loc[date_factor_start : iter_date, All]
    ser_iter_bench_ma_cut = ser_iter_bench_ma.loc[date_factor_start : iter_date]    
    ### Benchmark quality factor calculation (with minimum observations number check):
    date_idxmin = ser_iter_bench_ma_cut[::-1].idxmin()
    ser_iter_factor = ser_iter_asset_ma_cut.groupby('Country')\
                                    .apply(lambda ser_country: ser_country.droplevel('Country')[date_idxmin] if (ser_country.count() > (int_factor_win // 2)) else np.NaN)
    ### Add to csv file (should be substituted by SQL query):
    ser_iter_factor_csv = pd.concat({iter_date: ser_iter_factor}, names = ['Date'])
    ser_iter_factor_csv.to_csv(str_ci_rbq_raw_csv, mode = 'a', header = not os.path.exists(str_ci_rbq_raw_csv), sep = ';')
    ### Results output:
    return ser_iter_factor

In [13]:
### DEFINING NEGATIVE/NEGATIVE BETA FACTOR CREATING FUNCTION

def get_beta_nn_factor(iter_date):
    ### Factor parameters:
    int_stand_win = 260 * 5
    int_factor_win = 260 * 1
    ### Source load parameters:
    date_stand_start = iter_date - pd.tseries.offsets.BDay(int_stand_win - 1)
    date_factor_start = iter_date - pd.tseries.offsets.BDay(int_factor_win - 1)
    ### Datasource for particular date (should be substituted by SQL query):
    ser_iter_asset_raw = ser_country_ret_raw.loc[date_stand_start : iter_date, All]
    ser_iter_bench_raw = ser_bench_ret_raw.loc[date_stand_start : iter_date]
    idx_iter_range = pd.date_range(start = date_stand_start, end = iter_date, freq = 'B')
    ### Data source transforming:
#    ser_iter_asset_trans = ser_iter_asset_raw.groupby('Country').apply(get_clean_returns, idx_iter_range).swaplevel().sort_index()
    ser_iter_asset_trans = transformParallel_with_params(ser_iter_asset_raw.groupby('Country'), get_clean_returns_to_parallel, idx_iter_range).swaplevel().sort_index()    
    ser_iter_bench_trans = get_clean_returns(ser_iter_bench_raw, idx_iter_range)
    ### Last year extracting:
    ser_iter_asset_cut = ser_iter_asset_trans.loc[date_factor_start : iter_date, All]
    ser_iter_bench_cut = ser_iter_bench_trans.loc[date_factor_start : iter_date]    
    ### Positive returns to zero:
    ser_iter_asset_cut_nn = ser_iter_asset_cut.copy()
    ser_iter_asset_cut_nn.loc[ser_iter_asset_cut_nn > 0.0] = 0.0
    ser_iter_bench_cut_nn = ser_iter_bench_cut.copy()
    ser_iter_bench_cut_nn.loc[ser_iter_bench_cut_nn > 0.0] = 0.0
    ### Negative/Negative beta factor calculation (with minimum observations number check):    
    ser_iter_factor = -ser_iter_asset_cut_nn.groupby('Country')\
                                            .apply(lambda ser_country: (ser_country.droplevel('Country') * ser_iter_bench_cut_nn).sum() / (ser_iter_bench_cut ** 2).sum() \
                                                   if (ser_country.count() > (int_factor_win // 2)) else np.NaN)
    ### Add to csv file (should be substituted by SQL query):
    ser_iter_factor_csv = pd.concat({iter_date: ser_iter_factor}, names = ['Date'])
    ser_iter_factor_csv.to_csv(str_ci_beta_nn_raw_csv, mode = 'a', header = not os.path.exists(str_ci_beta_nn_raw_csv), sep = ';')    
    ### Results output:
    return ser_iter_factor

In [14]:
### DATA LOADING

### Sample returns data loading:
df_sample_ret = pd.read_excel(io = str_path_returns, sheet_name = str_sheet_returns, header = 0, parse_dates = True, index_col = [0],
                                 na_values = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', 
                                              '1.#QNAN', 'N/A', 'NULL', 'NaN', 'n/a', 'nan', 'null'], keep_default_na = False)
ser_bench_ret_raw = df_sample_ret['BENCH']
ser_country_ret_raw = df_sample_ret.iloc[:, :-1].stack().sort_index()
ser_country_ret_raw.index.set_names('Country', 1, inplace = True)

In [15]:
### PROFILING

df_ret_raw = df_sample_ret.iloc[:, :-1]
for _ in range(199):
    df_ret_raw = pd.concat([df_ret_raw, df_sample_ret.iloc[:, :-1]], axis = 1)
df_ret_raw.columns = list(range(len(df_ret_raw.columns)))
ser_country_ret_raw = df_ret_raw.stack().sort_index()
ser_country_ret_raw.index.set_names('Country', 1, inplace = True)
print('Countries number: ', len(df_ret_raw.columns))

Countries number:  1000


In [None]:
### TESTING: PERFORMING FACTOR FOR DATE RANGE

### Removing csv files before loop running:
if (os.path.exists(str_ci_var1_raw_csv)):
    os.remove(str_ci_var1_raw_csv)
if (os.path.exists(str_ci_rbq_raw_csv)):
    os.remove(str_ci_rbq_raw_csv)  
if (os.path.exists(str_ci_beta_nn_raw_csv)):
    os.remove(str_ci_beta_nn_raw_csv)    
### Local testing parameters:
int_interval = 10 ### Interval of progress displaying
date_start = datetime.utcnow() ### Start time of calculations
date_control = datetime.utcnow() ### Control time to display
idx_test_date_range = idx_test_daily_range[ : 30] # idx_test_monthly_range # idx_test_daily_range # 
### Test performing:
print('Start time:', date_start)
for iter_num, iter_date in enumerate(idx_test_date_range):
    ### Progress printing:
    if not (divmod(iter_num, int_interval)[1]):
        if iter_num:
            print('Counter marker:', iter_num, '/', len(idx_test_date_range))
            timedelta_interval = datetime.utcnow() - date_control
            print('Time interval since last marker:', datetime.utcnow() - date_control)            
            print('Average interval for single date:', str(timedelta_interval / int_interval))
        date_control = datetime.utcnow()
        
    ### Value-at-risk factor calculating:
    ser_iter_var1_factor = get_var1_factor(iter_date)  

    ### Return based quality factor calculating:
    ser_iter_rbq_factor = get_rbq_factor(iter_date)  

    ### Negative/negative beta factor calculating:
    ser_iter_beta_nn_factor = get_beta_nn_factor(iter_date)  

date_finish = datetime.utcnow()
### Overall statistics printing:
print('Finish time:', date_finish)
print('Full interval:', date_finish - date_start)
print('Average interval for single date:', str((date_finish - date_start) / len(idx_test_date_range)))

In [16]:
### PROFILING

def profile_it():
    ### Removing csv files before loop running:
    if (os.path.exists(str_ci_var1_raw_csv)):
        os.remove(str_ci_var1_raw_csv)
    if (os.path.exists(str_ci_rbq_raw_csv)):
        os.remove(str_ci_rbq_raw_csv)  
    if (os.path.exists(str_ci_beta_nn_raw_csv)):
        os.remove(str_ci_beta_nn_raw_csv)    
    ### Local testing parameters:
    int_interval = 10 ### Interval of progress displaying
    date_start = datetime.utcnow() ### Start time of calculations
    date_control = datetime.utcnow() ### Control time to display
    idx_test_date_range = idx_test_daily_range[-2 : ] # idx_test_monthly_range # idx_test_daily_range # 
    ### Test performing:
    print('Start time:', date_start)
    for iter_num, iter_date in enumerate(idx_test_date_range):
        ### Progress printing:
        if not (divmod(iter_num, int_interval)[1]):
            if iter_num:
                print('Counter marker:', iter_num, '/', len(idx_test_date_range))
                timedelta_interval = datetime.utcnow() - date_control
                print('Time interval since last marker:', datetime.utcnow() - date_control)            
                print('Average interval for single date:', str(timedelta_interval / int_interval))
            date_control = datetime.utcnow()

        ### Value-at-risk factor calculating:
        ser_iter_var1_factor = get_var1_factor(iter_date)  

        ### Return based quality factor calculating:
        ser_iter_rbq_factor = get_rbq_factor(iter_date)  

        ### Negative/negative beta factor calculating:
        ser_iter_beta_nn_factor = get_beta_nn_factor(iter_date)  

    date_finish = datetime.utcnow()
    ### Overall statistics printing:
    print('Finish time:', date_finish)
    print('Full interval:', date_finish - date_start)
    print('Average interval for single date:', str((date_finish - date_start) / len(idx_test_date_range)))

In [17]:
### PROFILING

gc.collect()

%lprun -f profile_it profile_it()
#%lprun -f get_var1_factor profile_it()
#%lprun -f get_rbq_factor profile_it()
#%lprun -f get_beta_nn_factor profile_it()
#%lprun -f get_country_vector profile_it()
#%lprun -f log_based_ma profile_it()
#%lprun -f multistep_standartize profile_it()

Start time: 2022-01-25 09:41:33.358270
Finish time: 2022-01-25 09:43:11.543175
Full interval: 0:01:38.184905
Average interval for single date: 0:00:49.092452


Timer unit: 1e-07 s

Total time: 98.1886 s
File: <ipython-input-16-7c12830ba866>
Function: profile_it at line 3

Line #      Hits         Time  Per Hit   % Time  Line Contents
     3                                           def profile_it():
     4                                               ### Removing csv files before loop running:
     5         1       1699.0   1699.0      0.0      if (os.path.exists(str_ci_var1_raw_csv)):
     6         1       1767.0   1767.0      0.0          os.remove(str_ci_var1_raw_csv)
     7         1        885.0    885.0      0.0      if (os.path.exists(str_ci_rbq_raw_csv)):
     8         1       1121.0   1121.0      0.0          os.remove(str_ci_rbq_raw_csv)  
     9         1       1093.0   1093.0      0.0      if (os.path.exists(str_ci_beta_nn_raw_csv)):
    10         1       1206.0   1206.0      0.0          os.remove(str_ci_beta_nn_raw_csv)    
    11                                               ### Local testing parameters:
    12         1  

In [47]:
### WAYS COMPARING

### Parameter initializing:
int_factor_win = 260
### Dummy data loading:
df_sample_ret = pd.read_excel(io = str_path_returns, sheet_name = str_sheet_returns, header = 0, parse_dates = True, index_col = [0],
                                 na_values = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', 
                                              '1.#QNAN', 'N/A', 'NULL', 'NaN', 'n/a', 'nan', 'null'], keep_default_na = False)
ser_iter_asset_ma_cut = df_sample_ret.iloc[:, :-1].stack().sort_index()
ser_iter_asset_ma_cut.index.set_names('Country', 1, inplace = True)
ser_iter_asset_ma_cut.name = 'Returns'
### Dummy bench vector creating:
ser_iter_bench_idxmin = pd.Series(pd.date_range(start = '2010-12-31', periods = 5 ,freq = 'BY'), index = ser_iter_asset_ma_cut.index.levels[-1])
ser_iter_bench_idxmin.index.names = ['Country']
ser_iter_bench_idxmin.name = 'Date'

In [17]:
### WAYS COMPARING

### YOUR WAY:
def way_01(ser_iter_asset_ma_cut, ser_iter_bench_idxmin):
    ser_result = ser_iter_asset_ma_cut.groupby('Country')\
                                    .apply(lambda x: x.droplevel('Country').loc[ser_iter_bench_idxmin[x.index[0][1]]] if (x.count() > (int_factor_win // 2)) else np.NaN)
    return ser_result

### FACTOR CALCULATION BY REINDEXATION WITHOUT GROUPING, BUT NO COUNT CHECK:
def way_02(ser_iter_asset_ma_cut, ser_iter_bench_idxmin):
    df_iter_bench_idxmin = ser_iter_bench_idxmin.to_frame()
    df_iter_bench_idxmin['Dummy'] = 1
    df_iter_bench_idxmin = df_iter_bench_idxmin.set_index('Date', append = True).swaplevel()
    ser_result = ser_iter_asset_ma_cut.reindex(df_iter_bench_idxmin.index).droplevel('Date')
    return ser_result

### FACTOR CALCULATION BY MERGE WITHOUT GROUPING, BUT NO COUNT CHECK:
def way_03(ser_iter_asset_ma_cut, ser_iter_bench_idxmin):
    df_result = ser_iter_bench_idxmin.to_frame().reset_index().merge(ser_iter_asset_ma_cut.to_frame().reset_index('Date'), how = 'left', on = ['Country', 'Date'])
    ser_result = df_result.set_index('Country')['Returns']
    return ser_result

print('Way 01 results:\n', way_01(ser_iter_asset_ma_cut, ser_iter_bench_idxmin))
%timeit way_01(ser_iter_asset_ma_cut, ser_iter_bench_idxmin)
print('Way 02 results:\n', way_02(ser_iter_asset_ma_cut, ser_iter_bench_idxmin))
%timeit way_02(ser_iter_asset_ma_cut, ser_iter_bench_idxmin)
print('Way 03 results:\n', way_03(ser_iter_asset_ma_cut, ser_iter_bench_idxmin))
%timeit way_03(ser_iter_asset_ma_cut, ser_iter_bench_idxmin)

Way 01 results:
 Country
AU    0.002081
DE    0.013574
GB    0.001541
JP   -0.000666
US   -0.010168
Name: Returns, dtype: float64
4.07 ms ± 218 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Way 02 results:
 Country
AU    0.002081
DE    0.013574
GB    0.001541
JP   -0.000666
US   -0.010168
Name: Returns, dtype: float64
2.49 ms ± 165 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Way 03 results:
 Country
AU    0.002081
DE    0.013574
GB    0.001541
JP   -0.000666
US   -0.010168
Name: Returns, dtype: float64
7.26 ms ± 216 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
