In [294]:
import queue
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [295]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import numpy as np
from tqdm import tqdm
from scipy import stats

In [296]:
from factor_design_distributed_ver1 import utils, factor_design, factor_backtest

In [297]:
# Load the raw training set as a DataFrame.
df_train = pd.read_csv('./research_train_set.csv')

# Pre-built column-name -> column-index map; the factor functions use it to index the
# numeric snapshot matrix by column name.
col2index_map = utils.load_json('./factor_design_distributed_ver1/col2index_map.json')
# The training data in dictionary form; this is what run_factor_value is fed below.
df_train_dic_sorted = utils.load_json('./factor_design_distributed_ver1/df_train_dic_sorted.json')

In [298]:
# Add all temporary factor files to the existing factors and return the result.
# Usually takes about 1 minute.
existed_factors = utils.add_factor_to_existed()

matching files: []
No temporary files found, no new factors added, returning existed_factors


In [299]:
# Tabular view of the existing factors (one column per factor, see the NaN summary below).
factor_df = pd.DataFrame(existed_factors)

In [300]:
# Count missing values per factor column; every count should be 0.
factor_df.isna().sum()

origin_seconds_in_bucket                               0
origin_imbalance_size                                  0
origin_imbalance_buy_sell_flag                         0
origin_reference_price                                 0
origin_matched_size                                    0
origin_far_price                                       0
origin_near_price                                      0
origin_bid_price                                       0
origin_bid_size                                        0
origin_ask_price                                       0
origin_ask_size                                        0
origin_wap                                             0
s1_imbalance_signed_pow_opt                            0
stock_20s_ret_deviate_from_index                       0
stock_40s_ret_deviate_from_index_rank_hedged_opt       0
triplet_imbalance_bid_ask_wap_signed_pow_opt_hedged    0
price_pressure                                         0
dtype: int64

# Reconstruct the original data factors using the forward-fill (ffill) method

In [301]:
# Re-read the raw training set (same file as loaded near the top of the notebook).
df_train = pd.read_csv('./research_train_set.csv')

In [302]:
# Count missing values per raw column; these gaps are what the forward fill below targets.
df_train.isna().sum()

stock_id                         0
date_id                          0
seconds_in_bucket                0
imbalance_size                 165
imbalance_buy_sell_flag          0
reference_price                165
matched_size                   165
far_price                  2408871
near_price                 2377155
bid_price                      165
bid_size                         0
ask_price                      165
ask_size                         0
wap                            165
target                          87
time_id                          0
row_id                           0
dtype: int64

In [303]:
def ffill_current_data(current_data: dict, hist_list: list):
    """Convert one snapshot to a float matrix and forward-fill its NaNs per stock.

    The snapshot dict is stacked into a 2-D float array with one row per stock and
    one column per dict entry, in dict order. Column 0 is treated as the stock id.
    A cache of the last known value of every column for every stock id in
    ``[0, 300)`` is kept in ``hist_list[0]`` and is created on the first call.

    NaNs in the snapshot are replaced by the cached value of the *same stock id*.
    The original implementation indexed the cache by the snapshot row position,
    which picks another stock's values whenever a stock is absent from a snapshot.
    The cache is then refreshed from the filled snapshot.

    Args:
        current_data: mapping ``column_name -> sequence of values`` for one time step.
        hist_list: state list owned by the calling factor; slot 0 is reserved for
            the cache managed here and is mutated in place.

    Returns:
        The snapshot as a float ``ndarray`` of shape ``(n_rows, n_columns)``.
        Entries with no earlier observation for that stock stay NaN.
    """
    current_data = np.array(list(current_data.values()), dtype=float).T

    if len(hist_list) == 0:
        # First call: allocate the cache; row i holds the last values of stock id i.
        old_data = np.full((300, current_data.shape[1]), np.nan, dtype=float)
        old_data[:, 0] = np.arange(0, 300)
        hist_list.append(old_data)
    else:
        old_data = hist_list[0]

    stock_ids = current_data[:, 0]
    # Only ids that have a cache row take part; comparisons with NaN are False,
    # so rows with a missing id are skipped as well.
    cacheable = (stock_ids >= 0) & (stock_ids < old_data.shape[0])
    cache_rows = stock_ids[cacheable].astype(int)

    # Boolean indexing copies, so `snapshot` can be edited and written back.
    snapshot = current_data[cacheable]
    missing = np.isnan(snapshot)
    # Look the replacement up by stock id, not by position in the snapshot.
    snapshot[missing] = old_data[cache_rows][missing]
    current_data[cacheable] = snapshot

    # Refresh the cache; values that are still NaN keep the cache entry NaN.
    # If an id appears twice in one snapshot, a later row overwrites an earlier one.
    old_data[cache_rows, 1:] = snapshot[:, 1:]
    return current_data

In [304]:
def ffill_factor_res(factor_res: np.ndarray,stock_id: np.ndarray, hist_list: list, used_hist_list: int):
    """Forward-fill NaN factor values with the last value seen for the same stock.

    A ``(300, 2)`` cache ``[stock_id, last_factor_value]`` lives in
    ``hist_list[used_hist_list]``. It is appended when ``hist_list`` currently holds
    exactly ``used_hist_list`` entries, i.e. on the first call for that slot.

    NaN results are replaced by the cached value of the *same stock id*. The
    original implementation indexed the cache by row position, which returns
    another stock's value whenever a stock is absent from the snapshot.

    Args:
        factor_res: 1-D factor values, aligned with ``stock_id``.
        stock_id: 1-D stock ids for the current snapshot.
        hist_list: state list owned by the calling factor; mutated in place.
        used_hist_list: index of the slot in ``hist_list`` reserved for this cache.

    Returns:
        1-D array of factor values. Entries with no earlier value for that stock,
        or whose id is outside ``[0, 300)``, are left as NaN.
    """
    current_res = np.hstack((stock_id.reshape(-1, 1), factor_res.reshape(-1, 1)))

    if len(hist_list) == used_hist_list:
        # First call for this slot: allocate the per-stock cache.
        old_data = np.full((300, 2), np.nan, dtype=float)
        old_data[:, 0] = np.arange(0, 300)
        hist_list.append(old_data)
    else:
        old_data = hist_list[used_hist_list]

    ids = current_res[:, 0]
    # Comparisons with NaN are False, so rows with a missing id are skipped.
    cacheable = (ids >= 0) & (ids < old_data.shape[0])
    cache_rows = ids[cacheable].astype(int)

    values = current_res[cacheable, 1]  # copy, written back below
    missing = np.isnan(values)
    # Look the replacement up by stock id, not by position in the snapshot.
    values[missing] = old_data[cache_rows[missing], 1]
    current_res[cacheable, 1] = values

    # Remember the (filled) values for the next call.
    old_data[cache_rows, 1] = values
    return current_res[:, 1]
        
        

In [305]:
def origin_seconds_in_bucket(current_data: dict, hist_list=[]) -> np.ndarray:
    """Pass-through factor: the forward-filled ``seconds_in_bucket`` column, negated.

    Args:
        current_data: mapping ``column_name -> values`` for the current time step.
        hist_list: state list forwarded to ``ffill_current_data``.
            NOTE(review): the default is a shared mutable list; confirm the
            caller always passes its own list.

    Returns:
        A NaN-free numpy array with one value per row of the snapshot.
    """
    snapshot = ffill_current_data(current_data, hist_list)
    column = snapshot[:, col2index_map['seconds_in_bucket']]
    # Values still missing after the forward fill (no earlier observation) become 0.
    column = np.where(np.isnan(column), 0, column)
    return -column

In [306]:
def origin_imbalance_size(current_data: dict, hist_list=[]) -> np.ndarray:
    """Pass-through factor: the forward-filled ``imbalance_size`` column.

    Args:
        current_data: mapping ``column_name -> values`` for the current time step.
        hist_list: state list forwarded to ``ffill_current_data``.

    Returns:
        A NaN-free numpy array with one value per row of the snapshot.
    """
    snapshot = ffill_current_data(current_data, hist_list)
    column = snapshot[:, col2index_map['imbalance_size']]
    # Values still missing after the forward fill (no earlier observation) become 0.
    column = np.where(np.isnan(column), 0, column)
    return column

In [307]:
def origin_imbalance_buy_sell_flag(current_data: dict, hist_list=[]) -> np.ndarray:
    """Pass-through factor: the forward-filled ``imbalance_buy_sell_flag`` column.

    Args:
        current_data: mapping ``column_name -> values`` for the current time step.
        hist_list: state list forwarded to ``ffill_current_data``.

    Returns:
        A NaN-free numpy array with one value per row of the snapshot.
    """
    snapshot = ffill_current_data(current_data, hist_list)
    column = snapshot[:, col2index_map['imbalance_buy_sell_flag']]
    # Values still missing after the forward fill (no earlier observation) become 0.
    column = np.where(np.isnan(column), 0, column)
    return column

In [308]:
def origin_reference_price(current_data: dict, hist_list=[]) -> np.ndarray:
    """Pass-through factor: the forward-filled ``reference_price`` column, negated.

    Args:
        current_data: mapping ``column_name -> values`` for the current time step.
        hist_list: state list forwarded to ``ffill_current_data``.

    Returns:
        A NaN-free numpy array with one value per row of the snapshot.
    """
    snapshot = ffill_current_data(current_data, hist_list)
    column = snapshot[:, col2index_map['reference_price']]
    # Values still missing after the forward fill (no earlier observation) become 0.
    column = np.where(np.isnan(column), 0, column)
    return -column

In [309]:
def origin_matched_size(current_data: dict, hist_list=[]) -> np.ndarray:
    """Pass-through factor: the forward-filled ``matched_size`` column.

    Args:
        current_data: mapping ``column_name -> values`` for the current time step.
        hist_list: state list forwarded to ``ffill_current_data``.

    Returns:
        A NaN-free numpy array with one value per row of the snapshot.
    """
    snapshot = ffill_current_data(current_data, hist_list)
    column = snapshot[:, col2index_map['matched_size']]
    # Values still missing after the forward fill (no earlier observation) become 0.
    column = np.where(np.isnan(column), 0, column)
    return column

In [310]:
def origin_far_price(current_data: dict, hist_list=[]) -> np.ndarray:
    """Pass-through factor: the forward-filled ``far_price`` column.

    Args:
        current_data: mapping ``column_name -> values`` for the current time step.
        hist_list: state list forwarded to ``ffill_current_data``.

    Returns:
        A NaN-free numpy array with one value per row of the snapshot.
    """
    snapshot = ffill_current_data(current_data, hist_list)
    column = snapshot[:, col2index_map['far_price']]
    # Values still missing after the forward fill (no earlier observation) become 0.
    column = np.where(np.isnan(column), 0, column)
    return column

In [311]:
def origin_near_price(current_data: dict, hist_list=[]) -> np.ndarray:
    """Pass-through factor: the forward-filled ``near_price`` column.

    Args:
        current_data: mapping ``column_name -> values`` for the current time step.
        hist_list: state list forwarded to ``ffill_current_data``.

    Returns:
        A NaN-free numpy array with one value per row of the snapshot.
    """
    snapshot = ffill_current_data(current_data, hist_list)
    column = snapshot[:, col2index_map['near_price']]
    # Values still missing after the forward fill (no earlier observation) become 0.
    column = np.where(np.isnan(column), 0, column)
    return column

In [312]:
def origin_bid_price(current_data: dict, hist_list=[]) -> np.ndarray:
    """Pass-through factor: the forward-filled ``bid_price`` column, negated.

    Args:
        current_data: mapping ``column_name -> values`` for the current time step.
        hist_list: state list forwarded to ``ffill_current_data``.

    Returns:
        A NaN-free numpy array with one value per row of the snapshot.
    """
    snapshot = ffill_current_data(current_data, hist_list)
    column = snapshot[:, col2index_map['bid_price']]
    # Values still missing after the forward fill (no earlier observation) become 0.
    column = np.where(np.isnan(column), 0, column)
    return -column

In [313]:
def origin_bid_size(current_data: dict, hist_list=[]) -> np.ndarray:
    """Pass-through factor: the forward-filled ``bid_size`` column, negated.

    Args:
        current_data: mapping ``column_name -> values`` for the current time step.
        hist_list: state list forwarded to ``ffill_current_data``.

    Returns:
        A NaN-free numpy array with one value per row of the snapshot.
    """
    snapshot = ffill_current_data(current_data, hist_list)
    column = snapshot[:, col2index_map['bid_size']]
    # Values still missing after the forward fill (no earlier observation) become 0.
    column = np.where(np.isnan(column), 0, column)
    return -column

In [314]:
def origin_ask_price(current_data: dict, hist_list=[]) -> np.ndarray:
    """Pass-through factor: the forward-filled ``ask_price`` column, negated.

    The zero-fill for values that are still missing after the forward fill was
    commented out in this function. It is restored here so the result honours the
    no-NaN contract and matches the other ``origin_*`` factors.

    Args:
        current_data: mapping ``column_name -> values`` for the current time step.
        hist_list: state list forwarded to ``ffill_current_data``.

    Returns:
        A NaN-free numpy array with one value per row of the snapshot.
    """
    snapshot = ffill_current_data(current_data, hist_list)
    column = snapshot[:, col2index_map['ask_price']]
    # A stock with no earlier observation cannot be forward-filled; use 0 instead.
    column = np.where(np.isnan(column), 0, column)
    return -column

In [315]:
def origin_ask_size(current_data: dict, hist_list=[]) -> np.ndarray:
    """Pass-through factor: the forward-filled ``ask_size`` column.

    The zero-fill for values that are still missing after the forward fill was
    commented out in this function. It is restored here so the result honours the
    no-NaN contract and matches the other ``origin_*`` factors.

    Args:
        current_data: mapping ``column_name -> values`` for the current time step.
        hist_list: state list forwarded to ``ffill_current_data``.

    Returns:
        A NaN-free numpy array with one value per row of the snapshot.
    """
    snapshot = ffill_current_data(current_data, hist_list)
    column = snapshot[:, col2index_map['ask_size']]
    # A stock with no earlier observation cannot be forward-filled; use 0 instead.
    column = np.where(np.isnan(column), 0, column)
    return column

In [316]:
def origin_wap(current_data: dict, hist_list=[]) -> np.ndarray:
    """Pass-through factor: the forward-filled ``wap`` column, negated.

    The zero-fill for values that are still missing after the forward fill was
    commented out in this function. It is restored here so the result honours the
    no-NaN contract and matches the other ``origin_*`` factors.

    Args:
        current_data: mapping ``column_name -> values`` for the current time step.
        hist_list: state list forwarded to ``ffill_current_data``.

    Returns:
        A NaN-free numpy array with one value per row of the snapshot.
    """
    snapshot = ffill_current_data(current_data, hist_list)
    column = snapshot[:, col2index_map['wap']]
    # A stock with no earlier observation cannot be forward-filled; use 0 instead.
    column = np.where(np.isnan(column), 0, column)
    return -column

In [317]:
def s1_imbalance_signed_pow_opt(current_data: dict, hist_list=[]) -> np.ndarray:
    """Signed-power transform of the bid/ask size imbalance.

    Computes ``(bid_size - ask_size) / (bid_size + ask_size)`` on the forward-filled
    snapshot, raises its magnitude to the power 1.4242424242424243 and gives the
    result the opposite sign of the imbalance.

    Args:
        current_data: mapping ``column_name -> values`` for the current time step.
        hist_list: per-factor state. Slot 0 holds the snapshot cache managed by
            ``ffill_current_data``; slot 1 holds the factor-value cache managed by
            ``ffill_factor_res``.

    Returns:
        A NaN-free numpy array with one value per row of the snapshot.
    """
    snapshot = ffill_current_data(current_data, hist_list)
    bid_size = snapshot[:, col2index_map['bid_size']]
    ask_size = snapshot[:, col2index_map['ask_size']]

    # A zero denominator yields NaN here; it is handled by the fills below.
    imbalance = (bid_size - ask_size) / (bid_size + ask_size)
    powered = np.sign(-imbalance) * np.abs(imbalance) ** 1.4242424242424243

    # Prefer the stock's previous factor value over a NaN, then fall back to 0.
    filled = ffill_factor_res(powered, snapshot[:, col2index_map['stock_id']], hist_list, 1)
    filled[np.isnan(filled)] = 0
    return filled
    

In [318]:
def stock_20s_ret_deviate_from_index(current_data: dict, hist_list=[]):
    """Negated deviation of each stock's 20-second WAP return from a synthetic index.

    Layout of ``hist_list`` as used by this factor:
        [0] snapshot cache (managed by ``ffill_current_data``)
        [1] list of index weights, one per stock id (200 entries, ids 0..199)
        [2] ``queue.Queue`` of earlier WAP vectors (two snapshots of lag)
        [3] factor-value cache (managed by ``ffill_factor_res``)

    Fix: the weight lookup used ``stock_id <= 200`` although the table only has
    indices 0..199, so stock id 200 raised ``IndexError``. The bound is now derived
    from the table length, and any id without a weight gets the mean weight.

    Args:
        current_data: mapping ``column_name -> values`` for the current time step.
        hist_list: per-factor state list, mutated in place.

    Returns:
        A NaN-free numpy array with one value per row of the snapshot.
    """
    current_data = ffill_current_data(current_data, hist_list)
    current_time = current_data[0, col2index_map['seconds_in_bucket']]
    current_wap = current_data[:, col2index_map['wap']]
    if len(hist_list) == 1:
        hist_list.append([
            0.004, 0.001, 0.002, 0.006, 0.004, 0.004, 0.002, 0.006, 0.006, 0.002, 0.002, 0.008,
            0.006, 0.002, 0.008, 0.006, 0.002, 0.006, 0.004, 0.002, 0.004, 0.001, 0.006, 0.004,
            0.002, 0.002, 0.004, 0.002, 0.004, 0.004, 0.001, 0.001, 0.002, 0.002, 0.006, 0.004,
            0.004, 0.004, 0.006, 0.002, 0.002, 0.04, 0.002, 0.002, 0.004, 0.04, 0.002, 0.001,
            0.006, 0.004, 0.004, 0.006, 0.001, 0.004, 0.004, 0.002, 0.006, 0.004, 0.006, 0.004,
            0.006, 0.004, 0.002, 0.001, 0.002, 0.004, 0.002, 0.008, 0.004, 0.004, 0.002, 0.004,
            0.006, 0.002, 0.004, 0.004, 0.002, 0.004, 0.004, 0.004, 0.001, 0.002, 0.002, 0.008,
            0.02, 0.004, 0.006, 0.002, 0.02, 0.002, 0.002, 0.006, 0.004, 0.002, 0.001, 0.02,
            0.006, 0.001, 0.002, 0.004, 0.001, 0.002, 0.006, 0.006, 0.004, 0.006, 0.001, 0.002,
            0.004, 0.006, 0.006, 0.001, 0.04, 0.006, 0.002, 0.004, 0.002, 0.002, 0.006, 0.002,
            0.002, 0.004, 0.006, 0.006, 0.002, 0.002, 0.008, 0.006, 0.004, 0.002, 0.006, 0.002,
            0.004, 0.006, 0.002, 0.004, 0.001, 0.004, 0.002, 0.004, 0.008, 0.006, 0.008, 0.002,
            0.004, 0.002, 0.001, 0.004, 0.004, 0.004, 0.006, 0.008, 0.004, 0.001, 0.001, 0.002,
            0.006, 0.004, 0.001, 0.002, 0.006, 0.004, 0.006, 0.008, 0.002, 0.002, 0.004, 0.002,
            0.04, 0.002, 0.002, 0.004, 0.002, 0.002, 0.006, 0.02, 0.004, 0.002, 0.006, 0.02,
            0.001, 0.002, 0.006, 0.004, 0.006, 0.004, 0.004, 0.004, 0.004, 0.002, 0.004, 0.04,
            0.002, 0.008, 0.002, 0.004, 0.001, 0.004, 0.006, 0.004,
        ])
    if current_time < 20:
        if current_time == 0:
            if len(hist_list) == 2:
                # Very first snapshot: create the WAP history queue.
                hist_list.append(queue.Queue())
            else:
                # A new day starts: drop the previous day's WAP history.
                hist_list[2].queue.clear()
        # Warm-up: not enough history for a 20s return yet.
        hist_list[2].put(current_wap)
        ret = np.zeros(current_wap.shape)
    else:
        hist_list[2].put(current_wap)
        # The oldest queued WAP is two snapshots back: the past 20s return.
        past_wap = hist_list[2].get()
        ret = (current_wap - past_wap) / past_wap

    # Per-row index weight. Ids beyond the weight table fall back to the mean weight.
    index_weights = np.asarray(hist_list[1], dtype=float)
    stock_ids = current_data[:, col2index_map['stock_id']].astype(int)
    has_weight = stock_ids < index_weights.size
    weight = np.full(stock_ids.shape, index_weights.mean())
    weight[has_weight] = index_weights[stock_ids[has_weight]]

    synthetic_index = np.sum(weight * ret)
    res = ret - synthetic_index
    res = ffill_factor_res(res, current_data[:, col2index_map['stock_id']], hist_list, 3)
    # Anything still NaN (no earlier factor value for that stock) becomes 0.
    res[np.isnan(res)] = 0
    return -res

In [319]:
def stock_40s_ret_deviate_from_index_rank_hedged_opt(current_data: dict, hist_list=[]):
    """Ranked 40s index-relative return, hedged against the 20s index-relative return.

    Layout of ``hist_list`` as used by this factor:
        [0] snapshot cache (managed by ``ffill_current_data``)
        [1] list of index weights, one per stock id (200 entries, ids 0..199)
        [2] ``queue.Queue`` of earlier WAP vectors for the 40s return
        [3] ``queue.Queue`` of earlier WAP vectors for the 20s return
        [4] factor-value cache (managed by ``ffill_factor_res``)

    Fix: the weight lookup used ``stock_id <= 200`` although the table only has
    indices 0..199, so stock id 200 raised ``IndexError``. The bound is now derived
    from the table length, and any id without a weight gets the mean weight.

    Args:
        current_data: mapping ``column_name -> values`` for the current time step.
        hist_list: per-factor state list, mutated in place.

    Returns:
        A NaN-free numpy array with one value per row of the snapshot.
    """
    current_data = ffill_current_data(current_data, hist_list)
    current_time = current_data[0, col2index_map['seconds_in_bucket']]
    current_wap = current_data[:, col2index_map['wap']]
    if len(hist_list) == 1:
        hist_list.append([
            0.004, 0.001, 0.002, 0.006, 0.004, 0.004, 0.002, 0.006, 0.006, 0.002, 0.002, 0.008,
            0.006, 0.002, 0.008, 0.006, 0.002, 0.006, 0.004, 0.002, 0.004, 0.001, 0.006, 0.004,
            0.002, 0.002, 0.004, 0.002, 0.004, 0.004, 0.001, 0.001, 0.002, 0.002, 0.006, 0.004,
            0.004, 0.004, 0.006, 0.002, 0.002, 0.04, 0.002, 0.002, 0.004, 0.04, 0.002, 0.001,
            0.006, 0.004, 0.004, 0.006, 0.001, 0.004, 0.004, 0.002, 0.006, 0.004, 0.006, 0.004,
            0.006, 0.004, 0.002, 0.001, 0.002, 0.004, 0.002, 0.008, 0.004, 0.004, 0.002, 0.004,
            0.006, 0.002, 0.004, 0.004, 0.002, 0.004, 0.004, 0.004, 0.001, 0.002, 0.002, 0.008,
            0.02, 0.004, 0.006, 0.002, 0.02, 0.002, 0.002, 0.006, 0.004, 0.002, 0.001, 0.02,
            0.006, 0.001, 0.002, 0.004, 0.001, 0.002, 0.006, 0.006, 0.004, 0.006, 0.001, 0.002,
            0.004, 0.006, 0.006, 0.001, 0.04, 0.006, 0.002, 0.004, 0.002, 0.002, 0.006, 0.002,
            0.002, 0.004, 0.006, 0.006, 0.002, 0.002, 0.008, 0.006, 0.004, 0.002, 0.006, 0.002,
            0.004, 0.006, 0.002, 0.004, 0.001, 0.004, 0.002, 0.004, 0.008, 0.006, 0.008, 0.002,
            0.004, 0.002, 0.001, 0.004, 0.004, 0.004, 0.006, 0.008, 0.004, 0.001, 0.001, 0.002,
            0.006, 0.004, 0.001, 0.002, 0.006, 0.004, 0.006, 0.008, 0.002, 0.002, 0.004, 0.002,
            0.04, 0.002, 0.002, 0.004, 0.002, 0.002, 0.006, 0.02, 0.004, 0.002, 0.006, 0.02,
            0.001, 0.002, 0.006, 0.004, 0.006, 0.004, 0.004, 0.004, 0.004, 0.002, 0.004, 0.04,
            0.002, 0.008, 0.002, 0.004, 0.001, 0.004, 0.006, 0.004,
        ])
    if current_time == 0:
        if len(hist_list) == 2:
            # Very first snapshot: create one WAP history queue per horizon.
            hist_list.append(queue.Queue())  # for the 40s ret
            hist_list.append(queue.Queue())  # for the 20s ret
        else:
            # A new day starts: drop the previous day's WAP history.
            hist_list[2].queue.clear()
            hist_list[3].queue.clear()
    if current_time < 20:
        # Warm-up: not enough history for a 20s return yet.
        hist_list[3].put(current_wap)
        hedge_20s_ret = np.zeros(current_wap.shape)
    else:
        hist_list[3].put(current_wap)
        # The oldest queued WAP is two snapshots back: the past 20s return.
        past_wap = hist_list[3].get()
        hedge_20s_ret = (current_wap - past_wap) / past_wap
    if current_time < 40:
        # Warm-up: not enough history for a 40s return yet.
        hist_list[2].put(current_wap)
        ret = np.zeros(current_wap.shape)
    else:
        hist_list[2].put(current_wap)
        # The oldest queued WAP is four snapshots back: the past 40s return.
        past_wap = hist_list[2].get()
        ret = (current_wap - past_wap) / past_wap

    # Per-row index weight. Ids beyond the weight table fall back to the mean weight.
    index_weights = np.asarray(hist_list[1], dtype=float)
    stock_ids = current_data[:, col2index_map['stock_id']].astype(int)
    has_weight = stock_ids < index_weights.size
    weight = np.full(stock_ids.shape, index_weights.mean())
    weight[has_weight] = index_weights[stock_ids[has_weight]]

    synthetic_index = np.sum(weight * ret)
    res = ret - synthetic_index
    hedge_20s_ret = hedge_20s_ret - np.sum(weight * hedge_20s_ret)
    # NaN returns are treated as 0 before ranking.
    res[np.isnan(res)] = 0
    # Descending rank scaled into (-1, 1].
    res = stats.rankdata(-res) / len(res)
    res = (res - 0.5) * 2
    # NOTE(review): utils.ols_res is defined outside this file. During the first two
    # buckets hedge_20s_ret is all zeros; confirm ols_res copes with a zero regressor.
    res = utils.ols_res(hedge_20s_ret, res, 0.33)
    res = ffill_factor_res(res, current_data[:, col2index_map['stock_id']], hist_list, 4)
    # Anything still NaN (no earlier factor value for that stock) becomes 0.
    res[np.isnan(res)] = 0

    return res

In [320]:
def triplet_imbalance_bid_ask_wap_signed_pow_opt_hedged(current_data: dict,hist_list=[])->np.ndarray:
    """Signed-power triplet imbalance of ``bid_price``, ``ask_price`` and ``wap``.

    For every row the three prices are ordered into max / mid / min and the ratio
    ``(max - mid) / (max + mid + min)`` is formed, then transformed with
    ``sign(x) * |x| ** 0.65``.

    Args:
        current_data: mapping ``column_name -> values`` for the current time step.
        hist_list: per-factor state. Slot 0 holds the snapshot cache managed by
            ``ffill_current_data``; slot 1 holds the factor-value cache managed by
            ``ffill_factor_res``.

    Returns:
        A NaN-free numpy array with one value per row of the snapshot.
    """
    snapshot = ffill_current_data(current_data, hist_list)

    # One row per price column, one column per stock.
    stacked = np.vstack([
        snapshot[:, col2index_map[name]] for name in ('bid_price', 'ask_price', 'wap')
    ])
    highest = stacked.max(axis=0)
    lowest = stacked.min(axis=0)
    # The middle value is whatever remains once max and min are removed from the sum.
    middle = stacked.sum(axis=0) - highest - lowest
    ratio = (highest - middle) / (highest + middle + lowest)

    powered = np.sign(ratio) * np.abs(ratio) ** 0.65
    # Prefer the stock's previous factor value over a NaN, then fall back to 0.
    filled = ffill_factor_res(powered, snapshot[:, col2index_map['stock_id']], hist_list, 1)
    filled[np.isnan(filled)] = 0
    return filled

In [321]:
def price_pressure(current_data: dict, hist_list=[]) -> np.ndarray:
    """Negated size imbalance scaled by the bid/ask spread.

    Computes ``(bid_size - ask_size) / (bid_size + ask_size) * (ask_price - bid_price)``
    on the forward-filled snapshot and returns its negative.

    Args:
        current_data: mapping ``column_name -> values`` for the current time step.
        hist_list: per-factor state. Slot 0 holds the snapshot cache managed by
            ``ffill_current_data``; slot 1 holds the factor-value cache managed by
            ``ffill_factor_res``.

    Returns:
        A NaN-free numpy array with one value per row of the snapshot.
    """
    snapshot = ffill_current_data(current_data, hist_list)
    bid_size = snapshot[:, col2index_map['bid_size']]
    ask_size = snapshot[:, col2index_map['ask_size']]
    spread = snapshot[:, col2index_map['ask_price']] - snapshot[:, col2index_map['bid_price']]

    # A zero size denominator yields NaN here; it is handled by the fills below.
    pressure = ((bid_size - ask_size) / (bid_size + ask_size)) * spread

    # Prefer the stock's previous factor value over a NaN, then fall back to 0.
    filled = ffill_factor_res(pressure, snapshot[:, col2index_map['stock_id']], hist_list, 1)
    filled[np.isnan(filled)] = 0
    return -filled


In [322]:
# Factors to compute: one 'origin_<column>' pass-through per raw column in
# df_train.columns[2:-3] (seconds_in_bucket .. wap), plus the five engineered factors.
new_factor_list = ['origin_'+item for item in df_train.columns[2:-3]]
new_factor_list+=['s1_imbalance_signed_pow_opt','stock_20s_ret_deviate_from_index','stock_40s_ret_deviate_from_index_rank_hedged_opt','triplet_imbalance_bid_ask_wap_signed_pow_opt_hedged','price_pressure']
# Uncomment to re-run a single factor only:
#new_factor_list=['stock_40s_ret_deviate_from_index_rank_hedged_opt',]
# DO NOT MODIFY THE FOLLOWING CODE
# Run this cell once you want to calculate your factor values and prepare for the test of your factor performance!
# Each name is resolved to the function of the same name defined above via eval(); the
# names come only from the literal list built in this cell.
new_factors = {
    factor_name: utils.flatten_factor_value(
        factor_design.run_factor_value(df_train_dic_sorted, eval(factor_name), factor_name), factor_name)[factor_name]
    for factor_name in tqdm(new_factor_list)}

  0%|          | 0/17 [00:00<?, ?it/s]

Start calculating factor origin_seconds_in_bucket
Finished calculating factor origin_seconds_in_bucket for 0 dates
Finished calculating factor origin_seconds_in_bucket for 100 dates
Finished calculating factor origin_seconds_in_bucket for 200 dates
Finished calculating factor origin_seconds_in_bucket for 300 dates
Finished calculating factor origin_seconds_in_bucket for 400 dates
Accepted!!: Used 11.21 seconds for calculation factors. The limit is 300 seconds.


  6%|▌         | 1/17 [00:12<03:12, 12.05s/it]

Start calculating factor origin_imbalance_size
Finished calculating factor origin_imbalance_size for 0 dates
Finished calculating factor origin_imbalance_size for 100 dates
Finished calculating factor origin_imbalance_size for 200 dates
Finished calculating factor origin_imbalance_size for 300 dates
Finished calculating factor origin_imbalance_size for 400 dates
Accepted!!: Used 11.56 seconds for calculation factors. The limit is 300 seconds.


 12%|█▏        | 2/17 [00:24<03:07, 12.50s/it]

Start calculating factor origin_imbalance_buy_sell_flag
Finished calculating factor origin_imbalance_buy_sell_flag for 0 dates
Finished calculating factor origin_imbalance_buy_sell_flag for 100 dates
Finished calculating factor origin_imbalance_buy_sell_flag for 200 dates
Finished calculating factor origin_imbalance_buy_sell_flag for 300 dates
Finished calculating factor origin_imbalance_buy_sell_flag for 400 dates
Accepted!!: Used 12.02 seconds for calculation factors. The limit is 300 seconds.


 18%|█▊        | 3/17 [00:38<02:59, 12.85s/it]

Start calculating factor origin_reference_price
Finished calculating factor origin_reference_price for 0 dates
Finished calculating factor origin_reference_price for 100 dates
Finished calculating factor origin_reference_price for 200 dates
Finished calculating factor origin_reference_price for 300 dates
Finished calculating factor origin_reference_price for 400 dates
Accepted!!: Used 10.79 seconds for calculation factors. The limit is 300 seconds.


 24%|██▎       | 4/17 [00:49<02:40, 12.38s/it]

Start calculating factor origin_matched_size
Finished calculating factor origin_matched_size for 0 dates
Finished calculating factor origin_matched_size for 100 dates
Finished calculating factor origin_matched_size for 200 dates
Finished calculating factor origin_matched_size for 300 dates
Finished calculating factor origin_matched_size for 400 dates
Accepted!!: Used 11.40 seconds for calculation factors. The limit is 300 seconds.


 29%|██▉       | 5/17 [01:02<02:29, 12.47s/it]

Start calculating factor origin_far_price
Finished calculating factor origin_far_price for 0 dates
Finished calculating factor origin_far_price for 100 dates
Finished calculating factor origin_far_price for 200 dates
Finished calculating factor origin_far_price for 300 dates
Finished calculating factor origin_far_price for 400 dates
Accepted!!: Used 11.41 seconds for calculation factors. The limit is 300 seconds.


 35%|███▌      | 6/17 [01:15<02:17, 12.53s/it]

Start calculating factor origin_near_price
Finished calculating factor origin_near_price for 0 dates
Finished calculating factor origin_near_price for 100 dates
Finished calculating factor origin_near_price for 200 dates
Finished calculating factor origin_near_price for 300 dates
Finished calculating factor origin_near_price for 400 dates
Accepted!!: Used 14.71 seconds for calculation factors. The limit is 300 seconds.


 41%|████      | 7/17 [01:31<02:16, 13.65s/it]

Start calculating factor origin_bid_price
Finished calculating factor origin_bid_price for 0 dates
Finished calculating factor origin_bid_price for 100 dates
Finished calculating factor origin_bid_price for 200 dates
Finished calculating factor origin_bid_price for 300 dates
Finished calculating factor origin_bid_price for 400 dates
Accepted!!: Used 11.01 seconds for calculation factors. The limit is 300 seconds.


 47%|████▋     | 8/17 [01:42<01:57, 13.07s/it]

Start calculating factor origin_bid_size
Finished calculating factor origin_bid_size for 0 dates
Finished calculating factor origin_bid_size for 100 dates
Finished calculating factor origin_bid_size for 200 dates
Finished calculating factor origin_bid_size for 300 dates
Finished calculating factor origin_bid_size for 400 dates
Accepted!!: Used 11.19 seconds for calculation factors. The limit is 300 seconds.


 53%|█████▎    | 9/17 [01:54<01:41, 12.73s/it]

Start calculating factor origin_ask_price
Finished calculating factor origin_ask_price for 0 dates
Finished calculating factor origin_ask_price for 100 dates
Finished calculating factor origin_ask_price for 200 dates
Finished calculating factor origin_ask_price for 300 dates
Finished calculating factor origin_ask_price for 400 dates
Accepted!!: Used 11.04 seconds for calculation factors. The limit is 300 seconds.


 59%|█████▉    | 10/17 [02:06<01:27, 12.50s/it]

Start calculating factor origin_ask_size
Finished calculating factor origin_ask_size for 0 dates
Finished calculating factor origin_ask_size for 100 dates
Finished calculating factor origin_ask_size for 200 dates
Finished calculating factor origin_ask_size for 300 dates
Finished calculating factor origin_ask_size for 400 dates
Accepted!!: Used 11.81 seconds for calculation factors. The limit is 300 seconds.


 65%|██████▍   | 11/17 [02:20<01:16, 12.72s/it]

Start calculating factor origin_wap
Finished calculating factor origin_wap for 0 dates
Finished calculating factor origin_wap for 100 dates
Finished calculating factor origin_wap for 200 dates
Finished calculating factor origin_wap for 300 dates
Finished calculating factor origin_wap for 400 dates
Accepted!!: Used 11.01 seconds for calculation factors. The limit is 300 seconds.


 71%|███████   | 12/17 [02:31<01:02, 12.46s/it]

Start calculating factor s1_imbalance_signed_pow_opt
Finished calculating factor s1_imbalance_signed_pow_opt for 0 dates


  res = (current_data[:, col2index_map['bid_size']] - current_data[:, col2index_map['ask_size']]) / \


Finished calculating factor s1_imbalance_signed_pow_opt for 100 dates
Finished calculating factor s1_imbalance_signed_pow_opt for 200 dates
Finished calculating factor s1_imbalance_signed_pow_opt for 300 dates
Finished calculating factor s1_imbalance_signed_pow_opt for 400 dates
Accepted!!: Used 16.31 seconds for calculation factors. The limit is 300 seconds.


 76%|███████▋  | 13/17 [02:49<00:55, 13.90s/it]

Start calculating factor stock_20s_ret_deviate_from_index
Finished calculating factor stock_20s_ret_deviate_from_index for 0 dates
Finished calculating factor stock_20s_ret_deviate_from_index for 100 dates
Finished calculating factor stock_20s_ret_deviate_from_index for 200 dates
Finished calculating factor stock_20s_ret_deviate_from_index for 300 dates
Finished calculating factor stock_20s_ret_deviate_from_index for 400 dates
Accepted!!: Used 18.57 seconds for calculation factors. The limit is 300 seconds.


  beta = np.dot(x,y)/np.dot(x,x)


Start calculating factor stock_40s_ret_deviate_from_index_rank_hedged_opt
Finished calculating factor stock_40s_ret_deviate_from_index_rank_hedged_opt for 0 dates
Finished calculating factor stock_40s_ret_deviate_from_index_rank_hedged_opt for 100 dates
Finished calculating factor stock_40s_ret_deviate_from_index_rank_hedged_opt for 200 dates
Finished calculating factor stock_40s_ret_deviate_from_index_rank_hedged_opt for 300 dates
Finished calculating factor stock_40s_ret_deviate_from_index_rank_hedged_opt for 400 dates
Accepted!!: Used 26.24 seconds for calculation factors. The limit is 300 seconds.


 88%|████████▊ | 15/17 [03:35<00:38, 19.05s/it]

Start calculating factor triplet_imbalance_bid_ask_wap_signed_pow_opt_hedged
Finished calculating factor triplet_imbalance_bid_ask_wap_signed_pow_opt_hedged for 0 dates
Finished calculating factor triplet_imbalance_bid_ask_wap_signed_pow_opt_hedged for 100 dates
Finished calculating factor triplet_imbalance_bid_ask_wap_signed_pow_opt_hedged for 200 dates
Finished calculating factor triplet_imbalance_bid_ask_wap_signed_pow_opt_hedged for 300 dates
Finished calculating factor triplet_imbalance_bid_ask_wap_signed_pow_opt_hedged for 400 dates
Accepted!!: Used 16.93 seconds for calculation factors. The limit is 300 seconds.


 94%|█████████▍| 16/17 [03:53<00:18, 18.68s/it]

Start calculating factor price_pressure
Finished calculating factor price_pressure for 0 dates
Finished calculating factor price_pressure for 100 dates
Finished calculating factor price_pressure for 200 dates
Finished calculating factor price_pressure for 300 dates
Finished calculating factor price_pressure for 400 dates
Accepted!!: Used 15.33 seconds for calculation factors. The limit is 300 seconds.


100%|██████████| 17/17 [04:09<00:00, 14.69s/it]


In [323]:
# Sanity-check every new factor: no NaN and no inf values are allowed.
# Then report its Pearson correlation with the target (missing targets treated as 0).
for factor_name, factor_value in new_factors.items():
    assert not np.isnan(factor_value).any(), f'{factor_name} contains nan'
    assert not np.isinf(factor_value).any(), f'{factor_name} contains inf'
    print(f'{factor_name} has correlation to target {np.corrcoef(factor_value, df_train["target"].fillna(0))[0, 1]}')


origin_seconds_in_bucket has correlation to target 0.0031411326809556643
origin_imbalance_size has correlation to target 0.00024120719792637867
origin_imbalance_buy_sell_flag has correlation to target 0.016656949485659357
origin_reference_price has correlation to target 0.01873604844356825
origin_matched_size has correlation to target 0.0008940093903293907
origin_far_price has correlation to target 0.0014903753836927956
origin_near_price has correlation to target 0.005637790885216793
origin_bid_price has correlation to target 0.024357965628517917
origin_bid_size has correlation to target 0.01802849277176966
origin_ask_price has correlation to target 0.022670265167399756
origin_ask_size has correlation to target 0.015096386763995221
origin_wap has correlation to target 0.03702305889673652
s1_imbalance_signed_pow_opt has correlation to target 0.11812660948303615
stock_20s_ret_deviate_from_index has correlation to target 0.08297931229149574
stock_40s_ret_deviate_from_index_rank_hedged_opt

In [324]:
# Write the training-set factor values to ./ffill_factors.json.
utils.dump_json_factors(new_factors, './ffill_factors.json')

# Produce factor data on the test set

In [328]:
# NOTE(review): this rebinds `utils`, `factor_design` and `factor_backtest`, which the
# import cell near the top of the notebook took from `factor_design_distributed_ver1`.
# Every cell executed after this one resolves those names to the
# `in_house_validation` versions.
# The recorded execution counts (328 here, 325/326 on the two loader cells that
# follow) suggest the cells were run out of order, so a fresh top-to-bottom run
# may use a different `utils` for those loaders than the recorded run did —
# TODO confirm both packages expose compatible helpers.
from in_house_validation import utils, factor_design, factor_backtest

In [325]:
# Read the research test-set CSV into a DataFrame.
test_csv_path = './research_test_set.csv'
df_test = pd.read_csv(test_csv_path)

In [326]:
# Load the pre-built test-data JSON through utils.load_json; the result is the
# data object passed to factor_design.run_factor_value for the test-set factors.
test_dic_path = './in_house_validation/test_data_dic_sorted.json'
df_test_dic_sorted = utils.load_json(test_dic_path)

In [329]:
# Compute every factor named in new_factor_list on df_test_dic_sorted.
# For each name: resolve the factor function, run it through
# factor_design.run_factor_value, flatten the result with
# utils.flatten_factor_value and keep the entry stored under the factor's name.
#
# The factor function is looked up by name in the notebook's global namespace
# rather than via eval(), so a name in the list can never execute arbitrary
# code. An unknown name now raises KeyError (eval raised NameError).
new_factors_test = {
    factor_name: utils.flatten_factor_value(
        factor_design.run_factor_value(df_test_dic_sorted, globals()[factor_name], factor_name),
        factor_name,
    )[factor_name]
    for factor_name in tqdm(new_factor_list)
}


  0%|          | 0/17 [00:00<?, ?it/s][A

Number of dates: 80
Start calculating factor origin_seconds_in_bucket
Finished calculating factor origin_seconds_in_bucket for 480 dates
Accepted!!: Used 5.02 seconds for calculation factors. The limit is 300 seconds.



  6%|▌         | 1/17 [00:05<01:24,  5.28s/it][A

Number of dates: 80
Start calculating factor origin_imbalance_size
Finished calculating factor origin_imbalance_size for 480 dates
Accepted!!: Used 2.83 seconds for calculation factors. The limit is 300 seconds.



 12%|█▏        | 2/17 [00:08<01:00,  4.02s/it][A

Number of dates: 80
Start calculating factor origin_imbalance_buy_sell_flag
Finished calculating factor origin_imbalance_buy_sell_flag for 480 dates
Accepted!!: Used 2.96 seconds for calculation factors. The limit is 300 seconds.



 18%|█▊        | 3/17 [00:11<00:51,  3.67s/it][A

Number of dates: 80
Start calculating factor origin_reference_price
Finished calculating factor origin_reference_price for 480 dates
Accepted!!: Used 2.90 seconds for calculation factors. The limit is 300 seconds.



 24%|██▎       | 4/17 [00:14<00:44,  3.46s/it][A

Number of dates: 80
Start calculating factor origin_matched_size
Finished calculating factor origin_matched_size for 480 dates
Accepted!!: Used 2.57 seconds for calculation factors. The limit is 300 seconds.



 29%|██▉       | 5/17 [00:17<00:38,  3.21s/it][A

Number of dates: 80
Start calculating factor origin_far_price
Finished calculating factor origin_far_price for 480 dates
Accepted!!: Used 2.72 seconds for calculation factors. The limit is 300 seconds.



 35%|███▌      | 6/17 [00:20<00:34,  3.13s/it][A

Number of dates: 80
Start calculating factor origin_near_price
Finished calculating factor origin_near_price for 480 dates
Accepted!!: Used 3.96 seconds for calculation factors. The limit is 300 seconds.



 41%|████      | 7/17 [00:24<00:35,  3.52s/it][A

Number of dates: 80
Start calculating factor origin_bid_price



 47%|████▋     | 8/17 [00:28<00:31,  3.52s/it][A

Finished calculating factor origin_bid_price for 480 dates
Accepted!!: Used 3.31 seconds for calculation factors. The limit is 300 seconds.
Number of dates: 80
Start calculating factor origin_bid_size
Finished calculating factor origin_bid_size for 480 dates
Accepted!!: Used 2.40 seconds for calculation factors. The limit is 300 seconds.



 53%|█████▎    | 9/17 [00:30<00:25,  3.23s/it][A

Number of dates: 80
Start calculating factor origin_ask_price
Finished calculating factor origin_ask_price for 480 dates
Accepted!!: Used 2.67 seconds for calculation factors. The limit is 300 seconds.



 59%|█████▉    | 10/17 [00:33<00:21,  3.13s/it][A

Number of dates: 80
Start calculating factor origin_ask_size
Finished calculating factor origin_ask_size for 480 dates
Accepted!!: Used 3.32 seconds for calculation factors. The limit is 300 seconds.



 65%|██████▍   | 11/17 [00:37<00:19,  3.29s/it][A

Number of dates: 80
Start calculating factor origin_wap
Finished calculating factor origin_wap for 480 dates
Accepted!!: Used 4.35 seconds for calculation factors. The limit is 300 seconds.



 71%|███████   | 12/17 [00:42<00:18,  3.71s/it][A

Number of dates: 80
Start calculating factor s1_imbalance_signed_pow_opt


  res = (current_data[:, col2index_map['bid_size']] - current_data[:, col2index_map['ask_size']]) / \


Finished calculating factor s1_imbalance_signed_pow_opt for 480 dates
Accepted!!: Used 5.34 seconds for calculation factors. The limit is 300 seconds.



 76%|███████▋  | 13/17 [00:47<00:17,  4.28s/it][A

Number of dates: 80
Start calculating factor stock_20s_ret_deviate_from_index
Finished calculating factor stock_20s_ret_deviate_from_index for 480 dates
Accepted!!: Used 7.07 seconds for calculation factors. The limit is 300 seconds.



  beta = np.dot(x,y)/np.dot(x,x)


Number of dates: 80
Start calculating factor stock_40s_ret_deviate_from_index_rank_hedged_opt
Finished calculating factor stock_40s_ret_deviate_from_index_rank_hedged_opt for 480 dates
Accepted!!: Used 6.33 seconds for calculation factors. The limit is 300 seconds.



 88%|████████▊ | 15/17 [01:01<00:11,  5.60s/it][A

Number of dates: 80
Start calculating factor triplet_imbalance_bid_ask_wap_signed_pow_opt_hedged
Finished calculating factor triplet_imbalance_bid_ask_wap_signed_pow_opt_hedged for 480 dates
Accepted!!: Used 3.88 seconds for calculation factors. The limit is 300 seconds.



 94%|█████████▍| 16/17 [01:05<00:05,  5.14s/it][A

Number of dates: 80
Start calculating factor price_pressure



100%|██████████| 17/17 [01:09<00:00,  4.07s/it][A

Finished calculating factor price_pressure for 480 dates
Accepted!!: Used 3.32 seconds for calculation factors. The limit is 300 seconds.





In [330]:
# Hand the new_factors_test mapping to utils.dump_json_factors together with
# the output path for the test-side factor file.
test_factor_path = './ffill_factors_test.json'
utils.dump_json_factors(new_factors_test, test_factor_path)

ml