In [55]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [56]:
# We will import the necessary libraries here. You can import any libraries you need for your beautiful factors!
from in_house_validation import utils, factor_design, factor_backtest
import numpy as np
import pandas as pd
from tqdm import tqdm
import queue
from scipy.stats import stats

# Load data

In [57]:
# Load the raw train and test research sets from CSV files in the working directory.
df_train = pd.read_csv('research_train_set.csv')
df_test = pd.read_csv('research_test_set.csv')

In [58]:
# Column-name -> column-index mapping. The factor functions below use it to pick columns
# (e.g. 'wap', 'bid_size', 'stock_id') out of the 2-D array they build from each snapshot dict.
col2index_map = utils.load_json('./in_house_validation/col2index_map.json')

In [59]:
# Pre-built train/test data dictionaries; these are what factor_design.run_factor_value consumes below.
# NOTE(review): their exact nesting (presumably date -> snapshot -> column dict) is not visible here — confirm in utils.
df_train_dic_sorted = utils.load_json('./in_house_validation/df_train_dic_sorted.json')
df_test_dic_sorted = utils.load_json('./in_house_validation/test_data_dic_sorted.json')

In [60]:
# Stored values of the already-registered factors for the train and test sets
# (the key listings printed further down show the 'origin_*' columns plus accepted factors).
existed_factor_train = utils.load_json('./in_house_validation/existed_factors.json')
existed_factor_test = utils.load_json('./in_house_validation/existed_factor_test.json')

In [61]:
# Stored performance records of the existing factors; factor_performance_train is handed to
# factor_backtest.Factor_Backtest in the backtesting section.
factor_performance_train = utils.load_json('./in_house_validation/factor_performance.json')
factor_performance_test = utils.load_json('./in_house_validation/factor_performance_test.json')

# New factor here

In [62]:
def s1_imbalance_signed_pow_opt(current_data: dict, hist_list=[]) -> np.ndarray:
    '''
    Sign-flipped, power-transformed bid/ask size imbalance.

    The raw imbalance is (bid_size - ask_size) / (bid_size + ask_size); the factor returned is
    sign(-imbalance) * |imbalance| ** 1.4242424242424243.

    Rationale (from the original author): most of the information of the imbalance is in its
    sign, and a pure sign can be seen as sign * abs ** 0; the exponent was optimised and
    1.424 came out best.

    Parameters
    ----------
    current_data : dict
        {column_name: column_values} for one snapshot. The values are stacked into a 2-D
        array whose columns are addressed through the global ``col2index_map``.
    hist_list : list, optional
        Not used by this factor; kept only so that all factors share one signature.

    Returns
    -------
    np.ndarray
        One value per row, with no NaN. Rows whose bid_size + ask_size is 0, or whose
        inputs are NaN, yield 0.
    '''
    # Stacking the dict values directly is faster than pd.DataFrame(current_data).values.
    snapshot = np.array(list(current_data.values()), dtype=float).T
    bid_size = snapshot[:, col2index_map['bid_size']]
    ask_size = snapshot[:, col2index_map['ask_size']]
    total_size = bid_size + ask_size

    # Divide only where the denominator is non-zero. This avoids the divide/invalid
    # RuntimeWarning of the plain `/` and maps an empty book to 0 instead of NaN/inf.
    imbalance = np.divide(bid_size - ask_size, total_size,
                          out=np.zeros_like(total_size), where=total_size != 0)
    # NaN inputs still propagate through the division; neutralise them.
    imbalance[np.isnan(imbalance)] = 0

    return np.sign(-imbalance) * np.abs(imbalance) ** 1.4242424242424243


In [63]:
def past_10s_ret(current_data: dict, hist_list=[]):
    '''
    Negated WAP return against the oldest snapshot kept in a FIFO.

    While seconds_in_bucket < 10 the FIFO is (re)initialised, the current WAP is stored and
    zeros are returned. Afterwards every call stores the current WAP, pops the oldest one
    and returns -(current - oldest) / oldest. With snapshots spaced 10 seconds apart the
    popped value is the WAP from 10 seconds earlier.

    current_data: {column_name: column_values} for one snapshot.
    hist_list: state carried between calls; slot 0 holds the queue.Queue of past WAP arrays.
    '''
    # Stacking the dict values directly is quicker than going through a DataFrame.
    snapshot = np.array(list(current_data.values()), dtype=float).T
    now = snapshot[0, col2index_map['seconds_in_bucket']]
    wap_now = snapshot[:, col2index_map['wap']]

    warming_up = now < 10
    if warming_up:
        # The state list is mutated in place (never rebound) so the caller keeps seeing it.
        if hist_list:
            hist_list[0].queue.clear()
        else:
            hist_list.append(queue.Queue())

    wap_fifo = hist_list[0]
    wap_fifo.put(wap_now)

    if warming_up:
        ret = np.zeros(wap_now.shape)
    else:
        wap_then = wap_fifo.get()
        ret = (wap_now - wap_then) / wap_then

    ret[np.isnan(ret)] = 0  # replace NaN returns with 0
    return -ret

In [64]:
def past_30s_ret_rank(current_data: dict, hist_list=[]):
    '''
    Negated cross-sectional rank of the WAP return against the oldest snapshot in a FIFO.

    At seconds_in_bucket == 0 the FIFO is created (first call) or emptied. While
    seconds_in_bucket < 30 the current WAP is only stored and the return is taken as 0;
    afterwards each call stores the current WAP and pops the oldest one. With snapshots
    spaced 10 seconds apart the popped value is the WAP from 30 seconds earlier.

    current_data: {column_name: column_values} for one snapshot.
    hist_list: state carried between calls; slot 0 holds the queue.Queue of past WAP arrays.
    Returns -rank / (n + 1), where rank uses the 'min' tie rule and n is the number of rows.
    '''
    # Stacking the dict values directly is quicker than going through a DataFrame.
    snapshot = np.array(list(current_data.values()), dtype=float).T
    now = snapshot[0, col2index_map['seconds_in_bucket']]
    wap_now = snapshot[:, col2index_map['wap']]

    if now == 0:
        # Start of a bucket: create the FIFO once, otherwise drop the previous bucket's data.
        if hist_list:
            hist_list[0].queue.clear()
        else:
            hist_list.append(queue.Queue())

    wap_fifo = hist_list[0]
    wap_fifo.put(wap_now)

    if now < 30:
        ret = np.zeros(wap_now.shape)
    else:
        wap_then = wap_fifo.get()
        ret = (wap_now - wap_then) / wap_then

    ret[np.isnan(ret)] = 0  # replace NaN returns with 0
    ranks = stats.rankdata(ret, method='min')
    return -ranks / (len(ret) + 1)

In [65]:
def stock_20s_ret_deviate_from_index(current_data: dict, hist_list=[]):
    '''
    Negated deviation of each stock's lagged WAP return from a weighted synthetic index.

    State kept in ``hist_list`` (mutated in place):
      * slot 0 - table of per-stock index weights, addressed by stock_id;
      * slot 1 - queue.Queue of past WAP arrays.

    At seconds_in_bucket == 0 the queue is created (first call) or emptied. While
    seconds_in_bucket < 20 the WAP is only stored and the return is 0; afterwards each call
    stores the current WAP and pops the oldest one. With snapshots spaced 10 seconds apart
    the popped value is the WAP from 20 seconds earlier.

    Parameters
    ----------
    current_data : dict
        {column_name: column_values} for one snapshot.
    hist_list : list, optional
        State carried between calls, see above.

    Returns
    -------
    np.ndarray
        -(ret - sum(weight * ret)) per row, with NaN replaced by 0.
    '''
    # Stacking the dict values directly is faster than pd.DataFrame(current_data).values.
    current_data = np.array(list(current_data.values()), dtype=float).T
    current_time = current_data[0, col2index_map['seconds_in_bucket']]
    current_wap = current_data[:, col2index_map['wap']]
    if len(hist_list) == 0:
        hist_list.append([
            0.004, 0.001, 0.002, 0.006, 0.004, 0.004, 0.002, 0.006, 0.006, 0.002, 0.002, 0.008,
            0.006, 0.002, 0.008, 0.006, 0.002, 0.006, 0.004, 0.002, 0.004, 0.001, 0.006, 0.004,
            0.002, 0.002, 0.004, 0.002, 0.004, 0.004, 0.001, 0.001, 0.002, 0.002, 0.006, 0.004,
            0.004, 0.004, 0.006, 0.002, 0.002, 0.04, 0.002, 0.002, 0.004, 0.04, 0.002, 0.001,
            0.006, 0.004, 0.004, 0.006, 0.001, 0.004, 0.004, 0.002, 0.006, 0.004, 0.006, 0.004,
            0.006, 0.004, 0.002, 0.001, 0.002, 0.004, 0.002, 0.008, 0.004, 0.004, 0.002, 0.004,
            0.006, 0.002, 0.004, 0.004, 0.002, 0.004, 0.004, 0.004, 0.001, 0.002, 0.002, 0.008,
            0.02, 0.004, 0.006, 0.002, 0.02, 0.002, 0.002, 0.006, 0.004, 0.002, 0.001, 0.02,
            0.006, 0.001, 0.002, 0.004, 0.001, 0.002, 0.006, 0.006, 0.004, 0.006, 0.001, 0.002,
            0.004, 0.006, 0.006, 0.001, 0.04, 0.006, 0.002, 0.004, 0.002, 0.002, 0.006, 0.002,
            0.002, 0.004, 0.006, 0.006, 0.002, 0.002, 0.008, 0.006, 0.004, 0.002, 0.006, 0.002,
            0.004, 0.006, 0.002, 0.004, 0.001, 0.004, 0.002, 0.004, 0.008, 0.006, 0.008, 0.002,
            0.004, 0.002, 0.001, 0.004, 0.004, 0.004, 0.006, 0.008, 0.004, 0.001, 0.001, 0.002,
            0.006, 0.004, 0.001, 0.002, 0.006, 0.004, 0.006, 0.008, 0.002, 0.002, 0.004, 0.002,
            0.04, 0.002, 0.002, 0.004, 0.002, 0.002, 0.006, 0.02, 0.004, 0.002, 0.006, 0.02,
            0.001, 0.002, 0.006, 0.004, 0.006, 0.004, 0.004, 0.004, 0.004, 0.002, 0.004, 0.04,
            0.002, 0.008, 0.002, 0.004, 0.001, 0.004, 0.006, 0.004,
        ])
    if current_time < 20:
        if current_time == 0:
            if len(hist_list) == 1:
                hist_list.append(queue.Queue())
            else:
                # New bucket: drop the WAPs left over from the previous one.
                hist_list[1].queue.clear()
        hist_list[1].put(current_wap)
        ret = np.zeros(current_wap.shape)
    else:
        hist_list[1].put(current_wap)
        # Return against the oldest stored snapshot.
        past_wap = hist_list[1].get()
        ret = (current_wap - past_wap) / past_wap

    # Look up the index weight per row. The table covers stock_id 0 .. len-1 only, so any id
    # outside that range (too large, negative or NaN) gets the average weight instead of
    # raising IndexError or silently wrapping around.
    index_weights = np.asarray(hist_list[0], dtype=float)
    stock_ids = current_data[:, col2index_map['stock_id']]
    known = (stock_ids >= 0) & (stock_ids < len(index_weights))
    lookup = np.where(known, stock_ids, 0).astype(int)
    weight = np.where(known, index_weights[lookup], index_weights.mean())

    synthetic_index = np.sum(weight * ret)
    res = ret - synthetic_index
    # A single NaN return makes synthetic_index NaN, so every row is zeroed here.
    res[np.isnan(res)] = 0
    return -res

In [66]:
def stock_40s_ret_deviate_from_index_rank_hedged_opt(current_data: dict, hist_list=[]):
    '''
    Rank of the index-relative lagged return, hedged against a shorter-lag index-relative return.

    State kept in ``hist_list`` (mutated in place):
      * slot 0 - table of per-stock index weights, addressed by stock_id;
      * slot 1 - queue.Queue of past WAP arrays for the long (40 s threshold) return;
      * slot 2 - queue.Queue of past WAP arrays for the short (20 s threshold) hedge return.

    At seconds_in_bucket == 0 both queues are created (first call) or emptied. Each queue
    only accumulates WAPs until its threshold is reached; after that every call stores the
    current WAP and pops the oldest one. With snapshots spaced 10 seconds apart the popped
    values are the WAPs from 40 and 20 seconds earlier respectively.

    Steps: both returns are demeaned by their weighted synthetic-index return; the long one
    is ranked on its negated value, scaled to rank / n and mapped to (rank / n - 0.5) * 2;
    the result is passed with the hedge return to ``utils.ols_res(hedge, factor, 0.33)``.
    NOTE(review): ``utils.ols_res`` is defined outside this notebook; presumably it removes
    the part of the factor explained by the hedge return — confirm its exact contract.

    Parameters
    ----------
    current_data : dict
        {column_name: column_values} for one snapshot.
    hist_list : list, optional
        State carried between calls, see above.

    Returns
    -------
    np.ndarray
        Output of ``utils.ols_res`` with NaN replaced by 0.
    '''
    # Stacking the dict values directly is faster than pd.DataFrame(current_data).values.
    current_data = np.array(list(current_data.values()), dtype=float).T
    current_time = current_data[0, col2index_map['seconds_in_bucket']]
    current_wap = current_data[:, col2index_map['wap']]
    if len(hist_list) == 0:
        hist_list.append([
            0.004, 0.001, 0.002, 0.006, 0.004, 0.004, 0.002, 0.006, 0.006, 0.002, 0.002, 0.008,
            0.006, 0.002, 0.008, 0.006, 0.002, 0.006, 0.004, 0.002, 0.004, 0.001, 0.006, 0.004,
            0.002, 0.002, 0.004, 0.002, 0.004, 0.004, 0.001, 0.001, 0.002, 0.002, 0.006, 0.004,
            0.004, 0.004, 0.006, 0.002, 0.002, 0.04, 0.002, 0.002, 0.004, 0.04, 0.002, 0.001,
            0.006, 0.004, 0.004, 0.006, 0.001, 0.004, 0.004, 0.002, 0.006, 0.004, 0.006, 0.004,
            0.006, 0.004, 0.002, 0.001, 0.002, 0.004, 0.002, 0.008, 0.004, 0.004, 0.002, 0.004,
            0.006, 0.002, 0.004, 0.004, 0.002, 0.004, 0.004, 0.004, 0.001, 0.002, 0.002, 0.008,
            0.02, 0.004, 0.006, 0.002, 0.02, 0.002, 0.002, 0.006, 0.004, 0.002, 0.001, 0.02,
            0.006, 0.001, 0.002, 0.004, 0.001, 0.002, 0.006, 0.006, 0.004, 0.006, 0.001, 0.002,
            0.004, 0.006, 0.006, 0.001, 0.04, 0.006, 0.002, 0.004, 0.002, 0.002, 0.006, 0.002,
            0.002, 0.004, 0.006, 0.006, 0.002, 0.002, 0.008, 0.006, 0.004, 0.002, 0.006, 0.002,
            0.004, 0.006, 0.002, 0.004, 0.001, 0.004, 0.002, 0.004, 0.008, 0.006, 0.008, 0.002,
            0.004, 0.002, 0.001, 0.004, 0.004, 0.004, 0.006, 0.008, 0.004, 0.001, 0.001, 0.002,
            0.006, 0.004, 0.001, 0.002, 0.006, 0.004, 0.006, 0.008, 0.002, 0.002, 0.004, 0.002,
            0.04, 0.002, 0.002, 0.004, 0.002, 0.002, 0.006, 0.02, 0.004, 0.002, 0.006, 0.02,
            0.001, 0.002, 0.006, 0.004, 0.006, 0.004, 0.004, 0.004, 0.004, 0.002, 0.004, 0.04,
            0.002, 0.008, 0.002, 0.004, 0.001, 0.004, 0.006, 0.004,
        ])
    if current_time == 0:
        if len(hist_list) == 1:
            # Very first call: create the two WAP queues.
            hist_list.append(queue.Queue())  # for the 40s ret
            hist_list.append(queue.Queue())  # for the 20s ret
        else:
            # New bucket: drop the WAPs left over from the previous one.
            hist_list[1].queue.clear()
            hist_list[2].queue.clear()

    # Short-lag (hedge) return.
    hist_list[2].put(current_wap)
    if current_time < 20:
        hedge_20s_ret = np.zeros(current_wap.shape)
    else:
        past_wap = hist_list[2].get()
        hedge_20s_ret = (current_wap - past_wap) / past_wap

    # Long-lag return.
    hist_list[1].put(current_wap)
    if current_time < 40:
        ret = np.zeros(current_wap.shape)
    else:
        past_wap = hist_list[1].get()
        ret = (current_wap - past_wap) / past_wap

    # Look up the index weight per row. The table covers stock_id 0 .. len-1 only, so any id
    # outside that range (too large, negative or NaN) gets the average weight instead of
    # raising IndexError or silently wrapping around.
    index_weights = np.asarray(hist_list[0], dtype=float)
    stock_ids = current_data[:, col2index_map['stock_id']]
    known = (stock_ids >= 0) & (stock_ids < len(index_weights))
    lookup = np.where(known, stock_ids, 0).astype(int)
    weight = np.where(known, index_weights[lookup], index_weights.mean())

    synthetic_index = np.sum(weight * ret)
    res = ret - synthetic_index
    hedge_20s_ret = hedge_20s_ret - np.sum(weight * hedge_20s_ret)
    # NaN deviations are zeroed before ranking.
    res[np.isnan(res)] = 0
    res = stats.rankdata(-res) / len(res)
    res = (res - 0.5) * 2
    res = utils.ols_res(hedge_20s_ret, res, 0.33)
    res[np.isnan(res)] = 0

    return res

In [67]:
def stock_40s_ret_deviate_from_index_rank(current_data: dict, hist_list=[]):
    '''
    Cross-sectional rank of each stock's lagged WAP return relative to a weighted synthetic index.

    State kept in ``hist_list`` (mutated in place):
      * slot 0 - table of per-stock index weights, addressed by stock_id;
      * slot 1 - queue.Queue of past WAP arrays.

    At seconds_in_bucket == 0 the queue is created (first call) or emptied. While
    seconds_in_bucket < 40 the WAP is only stored and the return is 0; afterwards each call
    stores the current WAP and pops the oldest one. With snapshots spaced 10 seconds apart
    the popped value is the WAP from 40 seconds earlier.

    Parameters
    ----------
    current_data : dict
        {column_name: column_values} for one snapshot.
    hist_list : list, optional
        State carried between calls, see above.

    Returns
    -------
    np.ndarray
        rank(-(ret - sum(weight * ret))) / n per row (default tie handling of rankdata).
    '''
    # Stacking the dict values directly is faster than pd.DataFrame(current_data).values.
    current_data = np.array(list(current_data.values()), dtype=float).T
    current_time = current_data[0, col2index_map['seconds_in_bucket']]
    current_wap = current_data[:, col2index_map['wap']]
    if len(hist_list) == 0:
        hist_list.append([
            0.004, 0.001, 0.002, 0.006, 0.004, 0.004, 0.002, 0.006, 0.006, 0.002, 0.002, 0.008,
            0.006, 0.002, 0.008, 0.006, 0.002, 0.006, 0.004, 0.002, 0.004, 0.001, 0.006, 0.004,
            0.002, 0.002, 0.004, 0.002, 0.004, 0.004, 0.001, 0.001, 0.002, 0.002, 0.006, 0.004,
            0.004, 0.004, 0.006, 0.002, 0.002, 0.04, 0.002, 0.002, 0.004, 0.04, 0.002, 0.001,
            0.006, 0.004, 0.004, 0.006, 0.001, 0.004, 0.004, 0.002, 0.006, 0.004, 0.006, 0.004,
            0.006, 0.004, 0.002, 0.001, 0.002, 0.004, 0.002, 0.008, 0.004, 0.004, 0.002, 0.004,
            0.006, 0.002, 0.004, 0.004, 0.002, 0.004, 0.004, 0.004, 0.001, 0.002, 0.002, 0.008,
            0.02, 0.004, 0.006, 0.002, 0.02, 0.002, 0.002, 0.006, 0.004, 0.002, 0.001, 0.02,
            0.006, 0.001, 0.002, 0.004, 0.001, 0.002, 0.006, 0.006, 0.004, 0.006, 0.001, 0.002,
            0.004, 0.006, 0.006, 0.001, 0.04, 0.006, 0.002, 0.004, 0.002, 0.002, 0.006, 0.002,
            0.002, 0.004, 0.006, 0.006, 0.002, 0.002, 0.008, 0.006, 0.004, 0.002, 0.006, 0.002,
            0.004, 0.006, 0.002, 0.004, 0.001, 0.004, 0.002, 0.004, 0.008, 0.006, 0.008, 0.002,
            0.004, 0.002, 0.001, 0.004, 0.004, 0.004, 0.006, 0.008, 0.004, 0.001, 0.001, 0.002,
            0.006, 0.004, 0.001, 0.002, 0.006, 0.004, 0.006, 0.008, 0.002, 0.002, 0.004, 0.002,
            0.04, 0.002, 0.002, 0.004, 0.002, 0.002, 0.006, 0.02, 0.004, 0.002, 0.006, 0.02,
            0.001, 0.002, 0.006, 0.004, 0.006, 0.004, 0.004, 0.004, 0.004, 0.002, 0.004, 0.04,
            0.002, 0.008, 0.002, 0.004, 0.001, 0.004, 0.006, 0.004,
        ])
    if current_time < 40:
        if current_time == 0:
            if len(hist_list) == 1:
                hist_list.append(queue.Queue())
            else:
                # New bucket: drop the WAPs left over from the previous one.
                hist_list[1].queue.clear()
        hist_list[1].put(current_wap)
        ret = np.zeros(current_wap.shape)
    else:
        hist_list[1].put(current_wap)
        # Return against the oldest stored snapshot.
        past_wap = hist_list[1].get()
        ret = (current_wap - past_wap) / past_wap

    # Look up the index weight per row. The table covers stock_id 0 .. len-1 only, so any id
    # outside that range (too large, negative or NaN) gets the average weight instead of
    # raising IndexError or silently wrapping around.
    index_weights = np.asarray(hist_list[0], dtype=float)
    stock_ids = current_data[:, col2index_map['stock_id']]
    known = (stock_ids >= 0) & (stock_ids < len(index_weights))
    lookup = np.where(known, stock_ids, 0).astype(int)
    weight = np.where(known, index_weights[lookup], index_weights.mean())

    synthetic_index = np.sum(weight * ret)
    res = ret - synthetic_index
    # NaN deviations are zeroed before ranking.
    res[np.isnan(res)] = 0
    return stats.rankdata(-res) / len(res)

In [68]:
# Names of the factor functions defined above (fine-tuned factor set), in evaluation order.
new_factor_list = [
    's1_imbalance_signed_pow_opt',
    'past_10s_ret',
    'past_30s_ret_rank',
    'stock_20s_ret_deviate_from_index',
    'stock_40s_ret_deviate_from_index_rank_hedged_opt',
    'stock_40s_ret_deviate_from_index_rank',
]

### Factor value calculation in train set

In [69]:
# Compute every factor in new_factor_list on the train set: {factor_name: flattened factor values}.
# The factor function is resolved by name through globals() rather than eval(), so a list entry
# can only ever refer to an existing top-level object and is never executed as an expression.
new_factors_train = {
    factor_name: utils.flatten_factor_value(
        factor_design.run_factor_value(df_train_dic_sorted, globals()[factor_name], factor_name),
        factor_name,
    )[factor_name]
    for factor_name in tqdm(new_factor_list)
}

  0%|          | 0/6 [00:00<?, ?it/s]

Number of dates: 401
Start calculating factor s1_imbalance_signed_pow_opt
Finished calculating factor s1_imbalance_signed_pow_opt for 0 dates


  res = (current_data[:, col2index_map['bid_size']] - current_data[:, col2index_map['ask_size']]) / \


Finished calculating factor s1_imbalance_signed_pow_opt for 100 dates
Finished calculating factor s1_imbalance_signed_pow_opt for 200 dates
Finished calculating factor s1_imbalance_signed_pow_opt for 300 dates
Finished calculating factor s1_imbalance_signed_pow_opt for 400 dates
Accepted!!: Used 17.33 seconds for calculation factors. The limit is 300 seconds.


 17%|█▋        | 1/6 [00:17<01:29, 17.97s/it]

Number of dates: 401
Start calculating factor past_10s_ret
Finished calculating factor past_10s_ret for 0 dates
Finished calculating factor past_10s_ret for 100 dates
Finished calculating factor past_10s_ret for 200 dates
Finished calculating factor past_10s_ret for 300 dates
Finished calculating factor past_10s_ret for 400 dates
Accepted!!: Used 3.98 seconds for calculation factors. The limit is 300 seconds.


  return -stats.rankdata(res, method='min') / (len(res) + 1)


Number of dates: 401
Start calculating factor past_30s_ret_rank
Finished calculating factor past_30s_ret_rank for 0 dates
Finished calculating factor past_30s_ret_rank for 100 dates
Finished calculating factor past_30s_ret_rank for 200 dates
Finished calculating factor past_30s_ret_rank for 300 dates
Finished calculating factor past_30s_ret_rank for 400 dates
Accepted!!: Used 6.30 seconds for calculation factors. The limit is 300 seconds.


 50%|█████     | 3/6 [00:29<00:25,  8.64s/it]

Number of dates: 401
Start calculating factor stock_20s_ret_deviate_from_index
Finished calculating factor stock_20s_ret_deviate_from_index for 0 dates
Finished calculating factor stock_20s_ret_deviate_from_index for 100 dates
Finished calculating factor stock_20s_ret_deviate_from_index for 200 dates
Finished calculating factor stock_20s_ret_deviate_from_index for 300 dates
Finished calculating factor stock_20s_ret_deviate_from_index for 400 dates
Accepted!!: Used 4.87 seconds for calculation factors. The limit is 300 seconds.


  res = stats.rankdata(-res) / len(res)
  beta = np.dot(x,y)/np.dot(x,x)


Number of dates: 401
Start calculating factor stock_40s_ret_deviate_from_index_rank_hedged_opt
Finished calculating factor stock_40s_ret_deviate_from_index_rank_hedged_opt for 0 dates
Finished calculating factor stock_40s_ret_deviate_from_index_rank_hedged_opt for 100 dates
Finished calculating factor stock_40s_ret_deviate_from_index_rank_hedged_opt for 200 dates
Finished calculating factor stock_40s_ret_deviate_from_index_rank_hedged_opt for 300 dates
Finished calculating factor stock_40s_ret_deviate_from_index_rank_hedged_opt for 400 dates
Accepted!!: Used 7.96 seconds for calculation factors. The limit is 300 seconds.


  return stats.rankdata(-res) / len(res)


Number of dates: 401
Start calculating factor stock_40s_ret_deviate_from_index_rank
Finished calculating factor stock_40s_ret_deviate_from_index_rank for 0 dates
Finished calculating factor stock_40s_ret_deviate_from_index_rank for 100 dates
Finished calculating factor stock_40s_ret_deviate_from_index_rank for 200 dates
Finished calculating factor stock_40s_ret_deviate_from_index_rank for 300 dates
Finished calculating factor stock_40s_ret_deviate_from_index_rank for 400 dates
Accepted!!: Used 7.17 seconds for calculation factors. The limit is 300 seconds.


100%|██████████| 6/6 [00:51<00:00,  8.51s/it]


### Factor value calculation in test set

In [70]:
# Compute every factor in new_factor_list on the test set: {factor_name: flattened factor values}.
# The factor function is resolved by name through globals() rather than eval(), so a list entry
# can only ever refer to an existing top-level object and is never executed as an expression.
new_factors_test = {
    factor_name: utils.flatten_factor_value(
        factor_design.run_factor_value(df_test_dic_sorted, globals()[factor_name], factor_name),
        factor_name,
    )[factor_name]
    for factor_name in tqdm(new_factor_list)
}

  0%|          | 0/6 [00:00<?, ?it/s]

Number of dates: 80
Start calculating factor s1_imbalance_signed_pow_opt


  res = (current_data[:, col2index_map['bid_size']] - current_data[:, col2index_map['ask_size']]) / \
 17%|█▋        | 1/6 [00:03<00:15,  3.02s/it]

Finished calculating factor s1_imbalance_signed_pow_opt for 480 dates
Accepted!!: Used 2.92 seconds for calculation factors. The limit is 300 seconds.
Number of dates: 80
Start calculating factor past_10s_ret


 33%|███▎      | 2/6 [00:03<00:07,  1.76s/it]

Finished calculating factor past_10s_ret for 480 dates
Accepted!!: Used 0.74 seconds for calculation factors. The limit is 300 seconds.
Number of dates: 80
Start calculating factor past_30s_ret_rank


  return -stats.rankdata(res, method='min') / (len(res) + 1)
 50%|█████     | 3/6 [00:05<00:04,  1.58s/it]

Finished calculating factor past_30s_ret_rank for 480 dates
Accepted!!: Used 1.27 seconds for calculation factors. The limit is 300 seconds.
Number of dates: 80
Start calculating factor stock_20s_ret_deviate_from_index


 67%|██████▋   | 4/6 [00:06<00:02,  1.37s/it]

Finished calculating factor stock_20s_ret_deviate_from_index for 480 dates
Accepted!!: Used 0.96 seconds for calculation factors. The limit is 300 seconds.
Number of dates: 80
Start calculating factor stock_40s_ret_deviate_from_index_rank_hedged_opt


  res = stats.rankdata(-res) / len(res)
 83%|████████▎ | 5/6 [00:08<00:01,  1.49s/it]

Finished calculating factor stock_40s_ret_deviate_from_index_rank_hedged_opt for 480 dates
Accepted!!: Used 1.61 seconds for calculation factors. The limit is 300 seconds.
Number of dates: 80
Start calculating factor stock_40s_ret_deviate_from_index_rank


  return stats.rankdata(-res) / len(res)
100%|██████████| 6/6 [00:09<00:00,  1.60s/it]

Finished calculating factor stock_40s_ret_deviate_from_index_rank for 480 dates
Accepted!!: Used 1.49 seconds for calculation factors. The limit is 300 seconds.





## Factor Backtesting

In [71]:
# Build the backtester from the stored train factors, the freshly computed train factors
# and the stored train performance records.
demo_backtest = factor_backtest.Factor_Backtest(existed_factors=existed_factor_train, testing_factors=new_factors_train,
                                                factor_performance=factor_performance_train)  # just a formality 

In [72]:
# Show which factors are currently stored for the train set.
print(existed_factor_train.keys())

dict_keys(['origin_seconds_in_bucket', 'origin_imbalance_size', 'origin_imbalance_buy_sell_flag', 'origin_reference_price', 'origin_matched_size', 'origin_far_price', 'origin_near_price', 'origin_bid_price', 'origin_bid_size', 'origin_ask_price', 'origin_ask_size', 'origin_wap', 's1_imbalance_signed_pow_opt'])


In [73]:
# Run the in-house validation of the new factors on the train and test values.
# NOTE(review): as the log below shows, a factor that is already stored in existed_factor_train
# (here s1_imbalance_signed_pow_opt) fails the pairwise-correlation check against itself.
demo_backtest.validate_new_factors(new_factors_train, new_factors_test)


_______________________________________Factor Performance_______________________________________________________
Factor s1_imbalance_signed_pow_opt PASSED in-sample performance check with correlation coefficient 0.11812815373942646

___________________________________Factor pairwise Correlation__________________________________________________
Factor s1_imbalance_signed_pow_opt FAILED in-sample correlation check with factor s1_imbalance_signed_pow_opt, 
The pair have correlation 0.9999999999999998
This falls in type 2 threshold, >0.7

The required performance is 1.2 times the old factor performance 0.11812815373942646 = 0.14175378448731174.
Performance of the new factor now is  0.11812815373942646
Factor s1_imbalance_signed_pow_opt failed in-sample tests

_______________________________________Factor Performance_______________________________________________________
Factor past_10s_ret PASSED in-sample performance check with correlation coefficient 0.06664332162654614

_______________

In [74]:
# Show which factors are currently stored for the test set.
existed_factor_test.keys()

dict_keys(['origin_seconds_in_bucket', 'origin_imbalance_size', 'origin_imbalance_buy_sell_flag', 'origin_reference_price', 'origin_matched_size', 'origin_far_price', 'origin_near_price', 'origin_bid_price', 'origin_bid_size', 'origin_ask_price', 'origin_ask_size', 'origin_wap'])

In [75]:
# Inspect the stored train values of one factor.
# NOTE(review): the key listing printed a few cells above does not contain this key, and the
# execution counter jumps after this cell, so this output presumably comes from out-of-order
# execution (the key is added by utils.add_factor_to_existed_train() further down). On a fresh
# Restart & Run All this lookup would likely raise KeyError — confirm and move it below that cell.
existed_factor_train['stock_20s_ret_deviate_from_index']

array([-0.00000000e+00, -0.00000000e+00, -0.00000000e+00, ...,
        5.85119705e-05,  1.20540846e-04,  1.48540061e-04])

In [87]:
# Refresh the stored test factor set from utils.
# NOTE(review): judging by its log output, the helper merges temporary factor files into the
# stored set and returns the result — confirm against in_house_validation.utils.
existed_factor_test = utils.add_factor_to_existed_test()

matching files: []
No temporary files found, no new factors added, returning existed_factors


In [88]:
# Show the test-set factor keys after the refresh.
existed_factor_test.keys()

dict_keys(['origin_seconds_in_bucket', 'origin_imbalance_size', 'origin_imbalance_buy_sell_flag', 'origin_reference_price', 'origin_matched_size', 'origin_far_price', 'origin_near_price', 'origin_bid_price', 'origin_bid_size', 'origin_ask_price', 'origin_ask_size', 'origin_wap', 's1_imbalance_signed_pow_opt', 'stock_20s_ret_deviate_from_index', 'stock_40s_ret_deviate_from_index_rank_hedged_opt'])

In [89]:
# Refresh the stored train factor set from utils.
# NOTE(review): per the log output, the helper merges the temporary tmp*.json files into the
# stored factors, overwrites factors that already exist, and deletes the temporary files, so
# this cell is not idempotent — re-running it finds nothing left to merge.
existed_factor_train = utils.add_factor_to_existed_train()

matching files: ['./in_house_validation\\tmp20231112.json', './in_house_validation\\tmp2023118.json']
Factor stock_20s_ret_deviate_from_index is a new factor, adding the value
Factor stock_40s_ret_deviate_from_index_rank_hedged_opt is a new factor, adding the value
./in_house_validation\tmp20231112.json added to existed_factors
Factor s1_imbalance_signed_pow_opt already exists in existed_factors, overwriting the value
./in_house_validation\tmp2023118.json added to existed_factors
storing the updated existed_factors
existed_factors updated, all temporary files deleted


In [90]:
# Show the train-set factor keys after the refresh.
existed_factor_train.keys()

dict_keys(['origin_seconds_in_bucket', 'origin_imbalance_size', 'origin_imbalance_buy_sell_flag', 'origin_reference_price', 'origin_matched_size', 'origin_far_price', 'origin_near_price', 'origin_bid_price', 'origin_bid_size', 'origin_ask_price', 'origin_ask_size', 'origin_wap', 's1_imbalance_signed_pow_opt', 'stock_20s_ret_deviate_from_index', 'stock_40s_ret_deviate_from_index_rank_hedged_opt'])