In [66]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [67]:
from factor_design_distributed_ver1 import utils, factor_design, factor_backtest
import pandas as pd
import numpy as np
from tqdm import tqdm

# load all relevant resources 

In [68]:
# play around the data 
df_train = pd.read_csv('./research_train_set.csv')

# We built a index map to speed up the factor design, you don't need to worry about this.
col2index_map = utils.load_json('./factor_design_distributed_ver1/col2index_map.json')
# df_train in a dictionary format
df_train_dic_sorted = utils.load_json('./factor_design_distributed_ver1/df_train_dic_sorted.json')

In [71]:
existed_factors = utils.add_factor_to_existed() # this function will add all temp files to the existed factor, then return the existed factor. usually take 1 min

matching files: ['./factor_design_distributed_ver1\\tmp20231216.json']
Factor price_pressure is a new factor, adding the value
./factor_design_distributed_ver1\tmp20231216.json added to existed_factors
storing the updated existed_factors
existed_factors updated, all temporary files deleted


In [72]:
existed_factors.keys()

dict_keys(['origin_seconds_in_bucket', 'origin_imbalance_size', 'origin_imbalance_buy_sell_flag', 'origin_reference_price', 'origin_matched_size', 'origin_far_price', 'origin_near_price', 'origin_bid_price', 'origin_bid_size', 'origin_ask_price', 'origin_ask_size', 'origin_wap', 's1_imbalance_signed_pow_opt', 'stock_20s_ret_deviate_from_index', 'stock_40s_ret_deviate_from_index_rank_hedged_opt', 'triplet_imbalance_bid_ask_wap_signed_pow_opt_hedged', 'price_pressure'])

In [46]:
factor_performance = utils.load_json('./factor_design_distributed_ver1/factor_performance.json')

# Factor Design

In [47]:
df_train.head()

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3180602.69,1,0.999812,13380276.64,,,0.999812,60651.5,1.000026,8493.03,1.0,-3.029704,0,0_0_0
1,1,0,0,166603.91,-1,0.999896,1642214.25,,,0.999896,3233.04,1.00066,20605.09,1.0,-5.519986,0,0_0_1
2,2,0,0,302879.87,-1,0.999561,1819368.03,,,0.999403,37956.0,1.000298,18995.0,1.0,-8.38995,0,0_0_2
3,3,0,0,11917682.27,-1,1.000171,18389745.62,,,0.999999,2324.9,1.000214,479032.4,1.0,-4.0102,0,0_0_3
4,4,0,0,447549.96,-1,0.999532,17860614.95,,,0.999394,16485.54,1.000016,434.1,1.0,-7.349849,0,0_0_4


In [48]:
def micro_price(current_data: dict,hist_list=[])->np.ndarray:
    """same logic as s1_imbalance, but only use matched volume"""
    current_data = np.array(list(current_data.values()),
                            dtype=float).T  # this is faster than pd.DataFrame(current_data).values
    # print(current_data.shape)
    res= (current_data[:,col2index_map['bid_price']]*current_data[:,col2index_map['ask_size']]+current_data[:,col2index_map['ask_price']]*current_data[:,col2index_map['bid_size']])/(current_data[:,col2index_map['ask_size']]+current_data[:,col2index_map['bid_size']])
    res[np.isnan(res)] = 0  # this is the fastest way to fill nan
    return -res  # The return value MUSE BE a numpy array, with no NaN value

In [49]:
def bid_ask_size(current_data: dict, hist_list=[]) -> np.ndarray:
    '''
    This will be the main function to design your factors for the competition. Please
    define only one factor here each time. We provide you with:

    Current_data: a dictionary in the format of {column_name: column_value}, where column_name is from the original
    dataframe

    hist_list: A list for you to save the previous factor values (optional). For instance,
    if you are calculating a 100-day Moving Average (MA), then you can save the first calculated
    MA in hist_list, and then for the next MA calculation, you can use the saved ones.
    '''
    ###################### ADD YOUR CODE HERE FOR FACTORS DESIGN ######################
    # convert the current_data to your choice of numpy or pandas dataframe
    # current_data = pd.DataFrame(current_data)
    current_data = np.array(list(current_data.values()),
                            dtype=float).T  # this is faster than pd.DataFrame(current_data).values
    res = (current_data[:, col2index_map['bid_size']] - current_data[:, col2index_map['ask_size']])
    ############################## NAN/Inf handling ######################################
    # if you have nan in your factor value, please fill it reasonably
    # res = np.nan_to_num(res) # this is slow because it also checks for inf.
    # res = np.where(np.isnan(res), 0, res)  # this is slightly faster than np.nan_to_num
    res[np.isnan(res)] = 0  # this is the fastest way to fill nan
    ############################## END OF YOUR CODE ##############################
    return res  # The return value MUSE BE a numpy array, with no NaN value
    # reason: s1_imbalance have most information in sign, sign can be think of as sign*abs**0. we optimize the power and get 1.424 as the best
    ####################################################################################


In [59]:
def triplet_imbalance_bid_ask_wap_signed_pow_opt_hedged(current_data: dict,hist_list=[])->np.ndarray:
    def triplet_imbalance_cal(data,col):
        assert len(col)==3
        temp= np.vstack([data[:,col2index_map[col[0]]],data[:,col2index_map[col[1]]],data[:,col2index_map[col[2]]]])
        max_val = np.max(temp,axis=0)
        min_val = np.min(temp,axis=0)
        mid_val = np.sum(temp,axis=0)-max_val-min_val
        
        # print(max_val.shape)
        return (max_val-mid_val)/(max_val+mid_val+min_val)
    current_data = np.array(list(current_data.values()),
                            dtype=float).T  # this is faster than pd.DataFrame(current_data).values
    res = triplet_imbalance_cal(current_data,['bid_price','ask_price','wap'])
    res = np.sign(res)*np.abs(res)**0.65
    s1_imbalance = np.sign(current_data[:, col2index_map['ask_size']] - current_data[:, col2index_map['bid_size']])
    s1_imbalance[np.isnan(s1_imbalance)] = 0  # this is the fastest w to fill nan
    # res = utils.ols_res(s1_imbalance,res,0.5) # The return value MUSE BE a numpy array, with no NaN value
    res[np.isnan(res)] = 0  # this is the fastest way to fill nan
    return res

In [52]:
def s1_imbalance(current_data: dict, hist_list=[]) -> np.ndarray:
    '''
    This will be the main function to design your factors for the competition. Please
    define only one factor here each time. We provide you with:

    Current_data: a dictionary in the format of {column_name: column_value}, where column_name is from the original
    dataframe

    hist_list: A list for you to save the previous factor values (optional). For instance,
    if you are calculating a 100-day Moving Average (MA), then you can save the first calculated
    MA in hist_list, and then for the next MA calculation, you can use the saved ones.
    '''
    ###################### ADD YOUR CODE HERE FOR FACTORS DESIGN ######################
    # convert the current_data to your choice of numpy or pandas dataframe
    # current_data = pd.DataFrame(current_data)
    current_data = np.array(list(current_data.values()),
                            dtype=float).T  # this is faster than pd.DataFrame(current_data).values
    res = (current_data[:, col2index_map['bid_size']] - current_data[:, col2index_map['ask_size']]) 
    ############################## NAN/Inf handling ######################################
    # if you have nan in your factor value, please fill it reasonably
    # res = np.nan_to_num(res) # this is slow because it also checks for inf.
    # res = np.where(np.isnan(res), 0, res)  # this is slightly faster than np.nan_to_num
    res[np.isnan(res)] = 0  # this is the fastest way to fill nan
    ############################## END OF YOUR CODE ##############################
    return np.sign(-res)  # The return value MUSE BE a numpy array, with no NaN value
    ####################################################################################


In [60]:
new_factor_list = ['triplet_imbalance_bid_ask_wap_signed_pow_opt_hedged']

In [61]:
# DO NOT MODIFY THE FOLLOWING CODE
# Run this cell once you want to calculate your factor values and prepare for the test of your factor performance!
new_factors = {
    factor_name: utils.flatten_factor_value(
        factor_design.run_factor_value(df_train_dic_sorted, eval(factor_name), factor_name), factor_name)[factor_name]
    for factor_name in tqdm(new_factor_list)}

  0%|          | 0/1 [00:00<?, ?it/s]

Start calculating factor triplet_imbalance_bid_ask_wap_signed_pow_opt_hedged
Finished calculating factor triplet_imbalance_bid_ask_wap_signed_pow_opt_hedged for 0 dates
Finished calculating factor triplet_imbalance_bid_ask_wap_signed_pow_opt_hedged for 100 dates
Finished calculating factor triplet_imbalance_bid_ask_wap_signed_pow_opt_hedged for 200 dates
Finished calculating factor triplet_imbalance_bid_ask_wap_signed_pow_opt_hedged for 300 dates
Finished calculating factor triplet_imbalance_bid_ask_wap_signed_pow_opt_hedged for 400 dates
Accepted!!: Used 18.35 seconds for calculation factors. The limit is 300 seconds.


100%|██████████| 1/1 [00:19<00:00, 19.40s/it]


In [62]:
# check if all values in new_factor do not contain nan, no NA is allowed in the factor value
for factor_name, factor_value in new_factors.items():
    assert not np.isnan(factor_value).any(), f'{factor_name} contains nan'
    assert not np.isinf(factor_value).any(), f'{factor_name} contains inf'
    print(f'{factor_name} has correlation to target {np.corrcoef(factor_value, df_train["target"].fillna(0))[0, 1]}')


triplet_imbalance_bid_ask_wap_signed_pow_opt_hedged has correlation to target 0.10468547554709652


In [63]:
# simple check for pairwise correlation
for factor_name, factor_value in new_factors.items():
    for existed_factor_name, existed_factor_value in existed_factors.items():
        correlation = np.corrcoef(factor_value, existed_factor_value)[0, 1]
        if correlation > 0.4:# if correlation with existing factor < 0.4, there is no penalty for this factor
            print(f'{factor_name} has correlation to {existed_factor_name} {np.corrcoef(factor_value, existed_factor_value)[0, 1]}')
            print(f'    This existing factor have a correlation to target {np.corrcoef(existed_factor_value, df_train["target"].fillna(0))[0, 1]}')
            break
print('Other factors passed pairwise correlation < 0.4 requirement')

triplet_imbalance_bid_ask_wap_signed_pow_opt_hedged has correlation to s1_imbalance_signed_pow_opt 0.5653820461585352
    This existing factor have a correlation to target 0.11812815373942646
Other factors passed pairwise correlation < 0.4 requirement


# Improving factor

### signed power optimization

In [17]:
import matplotlib.pyplot as plt

In [18]:
np.corrcoef(np.sign(new_factors['triplet_imbalance_price']),df_train['target'].fillna(0))[0,1]

KeyError: 'triplet_imbalance_price'

In [None]:
opt = [np.corrcoef(np.sign(new_factors['triplet_imbalance_price'])*np.abs(new_factors['triplet_imbalance_price'])**i,df_train['target'].fillna(0))[0,1] for i in np.linspace(0.05,5,100)]
plt.plot(np.linspace(0.05,5,100),opt)

In [None]:
max(opt)

In [None]:
np.linspace(0.05,5,100)[np.argmax(opt)]

### hedging

In [None]:
opt = [utils.ols_res(np.sign(new_factors['s1_imbalance']),new_factors['triplet_imbalance_bid_ask_wap_signed_pow_opt_hedged'],i) for i in np.linspace(0.05,1,50)]
performance_opt = [np.corrcoef(item,df_train['target'].fillna(0))[0,1] for item in opt]
pairwise_opt = [np.corrcoef(item,np.sign(new_factors['s1_imbalance']))[0,1] for item in opt]

In [None]:
plt.plot(np.linspace(0.05,1,50),performance_opt,label='performance')
plt.plot(np.linspace(0.05,1,50),pairwise_opt,label='pairwise')
plt.legend()
plt.hlines(0.4,0,1,linestyles='dashed',colors='r')
plt.hlines(0.05,0,1,linestyles='dashed',colors='r')
plt.show()

In [None]:
np.abs(np.array(pairwise_opt)-0.4)

In [None]:
np.linspace(0.05,1,50)[np.argmin(np.abs(np.array(pairwise_opt)-0.4))]

# Factor Backtesting

In [64]:
demo_backtest = factor_backtest.Factor_Backtest(
    existed_factors=existed_factors,
    testing_factors=new_factors,
    factor_performance=factor_performance
)

In [65]:
demo_backtest.run_testing()


Start testing factor: triplet_imbalance_bid_ask_wap_signed_pow_opt_hedged..........

_______________________________________Factor Performance_______________________________________________________
Factor triplet_imbalance_bid_ask_wap_signed_pow_opt_hedged PASSED in-sample performance check with correlation coefficient 0.10468547554709652

___________________________________Factor pairwise Correlation__________________________________________________
Factor triplet_imbalance_bid_ask_wap_signed_pow_opt_hedged PASSED in-sample pairwise correlation check with all existed factors
______________________________________________________________________________________________

_______________________________________Factor Conclusion_______________________________________________________
Factor triplet_imbalance_bid_ask_wap_signed_pow_opt_hedged PASSED


______________________________________Overall Conclusion________________________________________________________
Factors passed all the test