In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from factor_design_ver3_beta import utils, factor_design, factor_backtest
import pandas as pd
import numpy as np
from tqdm import tqdm

# load all relevant resources 

In [3]:
# play around the data 
df_train = pd.read_csv('./research_train_set.csv')

# We built a index map to speed up the factor design, you don't need to worry about this.
col2index_map = utils.load_json('./factor_design_ver3_alpha/col2index_map.json')
# df_train in a dictionary format
df_train_dic_sorted = utils.load_json('./factor_design_ver3_alpha/df_train_dic_sorted.json')

In [4]:
# please add all tmpYYYMMDD.json files to factor_design_ver3_beta folder, following code will load the passed factors to the existed factors
# roughly 1 min
utils.add_factor_to_existed()

matching files: ['./factor_design_ver3_beta\\tmp2023118.json']
./factor_design_ver3_beta\tmp2023118.json added to existed_factors


In [5]:
existed_factors = utils.load_json_factors('./factor_design_ver3_beta/existed_factors.json')
factor_performance = utils.load_json('./factor_design_ver3_beta/factor_performance.json')

# Factor Design

In [6]:
# sample factor
def s1_imbalance(current_data: dict, hist_list=[]) -> np.ndarray:
    '''
    This will be the main function to design your factors for the competition. Please
    define only one factor here each time. We provide you with:

    Current_data: a dictionary in the format of {column_name: column_value}, where column_name is from the original
    dataframe

    hist_list: A list for you to save the previous factor values (optional). For instance,
    if you are calculating a 100-day Moving Average (MA), then you can save the first calculated
    MA in hist_list, and then for the next MA calculation, you can use the saved ones.
    '''
    ###################### ADD YOUR CODE HERE FOR FACTORS DESIGN ######################
    # convert the current_data to your choice of numpy or pandas dataframe
    # current_data = pd.DataFrame(current_data)
    current_data = np.array(list(current_data.values()),
                            dtype=float).T  # this is faster than pd.DataFrame(current_data).values
    res = (current_data[:, col2index_map['bid_size']] - current_data[:, col2index_map['ask_size']]) / \
          (current_data[:, col2index_map['bid_size']] + current_data[:, col2index_map['ask_size']])
    ############################## NAN/Inf handling ######################################
    # if you have nan in your factor value, please fill it reasonably
    # res = np.nan_to_num(res) # this is slow because it also checks for inf.
    # res = np.where(np.isnan(res), 0, res)  # this is slightly faster than np.nan_to_num
    res[np.isnan(res)] = 0  # this is the fastest way to fill nan
    ############################## END OF YOUR CODE ##############################
    return -res  # The return value MUSE BE a numpy array, with no NaN value
    ####################################################################################


In [7]:
new_factor_list = ['s1_imbalance']

In [8]:
# DO NOT MODIFY THE FOLLOWING CODE
# Run this cell once you want to calculate your factor values and prepare for the test of your factor performance!
new_factors = {
    factor_name: utils.flatten_factor_value(
        factor_design.run_factor_value(df_train_dic_sorted, eval(factor_name), factor_name), factor_name)[factor_name]
    for factor_name in tqdm(new_factor_list)}

  0%|          | 0/1 [00:00<?, ?it/s]

Start calculating factor s1_imbalance
Finished calculating factor s1_imbalance for 0 dates


  res = (current_data[:, col2index_map['bid_size']] - current_data[:, col2index_map['ask_size']]) / \


Finished calculating factor s1_imbalance for 100 dates
Finished calculating factor s1_imbalance for 200 dates
Finished calculating factor s1_imbalance for 300 dates
Finished calculating factor s1_imbalance for 400 dates
Accepted!!: Used 11.05 seconds for calculation factors. The limit is 300 seconds.


100%|██████████| 1/1 [00:11<00:00, 11.70s/it]


In [9]:
# check if all values in new_factor and original_factor_dict does not contain nan, no NA is allowed in the factor value
for factor_name, factor_value in new_factors.items():
    assert not np.isnan(factor_value).any(), f'{factor_name} contains nan'
    assert not np.isinf(factor_value).any(), f'{factor_name} contains inf'
    print(f'{factor_name} has correlation to target {np.corrcoef(factor_value, df_train["target"].fillna(0))[0, 1]}')


s1_imbalance has correlation to target 0.11723116519487425


In [10]:
pd.DataFrame(new_factors).corr()

Unnamed: 0,s1_imbalance
s1_imbalance,1.0


# Existed factors

In [13]:
existed_factors.keys()

dict_keys(['origin_seconds_in_bucket', 'origin_imbalance_size', 'origin_imbalance_buy_sell_flag', 'origin_reference_price', 'origin_matched_size', 'origin_far_price', 'origin_near_price', 'origin_bid_price', 'origin_bid_size', 'origin_ask_price', 'origin_ask_size', 'origin_wap', 's1_imbalance_signed_pow_opt'])

# Factor Backtesting

In [11]:
demo_backtest = factor_backtest.Factor_Backtest(
    existed_factors=existed_factors,
    testing_factors=new_factors,
    factor_performance=factor_performance
)

In [12]:
demo_backtest.run_testing()

Start testing factor: s1_imbalance..........
Factor s1_imbalance passed in-sample performance check with correlation coefficient 0.11723116519487425
{'origin_seconds_in_bucket': -0.00275351467424382, 'origin_imbalance_size': -0.007912562170556144, 'origin_imbalance_buy_sell_flag': 0.0072728122913709396, 'origin_reference_price': 0.003089423955432897, 'origin_matched_size': -0.0029542783068026088, 'origin_far_price': -0.0013442725990691496, 'origin_near_price': -0.001989082554881222, 'origin_bid_price': 0.0032888458109110777, 'origin_bid_size': 0.017996957016120273, 'origin_ask_price': 0.004788753844761137, 'origin_ask_size': 0.014149922396686877, 'origin_wap': 0.0070193634593688865, 's1_imbalance_signed_pow_opt': 0.11812815373942646, 's1_imbalance': 0.11723116519487425}
Factor s1_imbalance failed in-sample correlation check with factor s1_imbalance_signed_pow_opt
Factor s1_imbalance has correlation coefficient 0.992082905345458 with factor s1_imbalance_signed_pow_opt, and sowe need the