In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from factor_design_ver2_beta import utils
from factor_design_ver2_beta import factor_design
from factor_design_ver2_beta import factor_backtest
from joblib import Parallel, delayed
import numpy as np
import pandas as pd
from tqdm import tqdm

In [3]:
df_train = pd.read_csv('research_train_set.csv')  # this is needed for the factor design

In [4]:
col2index_map = utils.load_json('./factor_design_ver2_beta/col2index_map.json')
col2index_map

{'stock_id': 0,
 'date_id': 1,
 'seconds_in_bucket': 2,
 'imbalance_size': 3,
 'imbalance_buy_sell_flag': 4,
 'reference_price': 5,
 'matched_size': 6,
 'far_price': 7,
 'near_price': 8,
 'bid_price': 9,
 'bid_size': 10,
 'ask_price': 11,
 'ask_size': 12,
 'wap': 13,
 'time_id': 14,
 'row_id': 15}

In [5]:
# count the number of nan in each column of df_train
df_train.isna().sum()

stock_id                         0
date_id                          0
seconds_in_bucket                0
imbalance_size                 165
imbalance_buy_sell_flag          0
reference_price                165
matched_size                   165
far_price                  2408871
near_price                 2377155
bid_price                      165
bid_size                         0
ask_price                      165
ask_size                         0
wap                            165
target                          87
time_id                          0
row_id                           0
dtype: int64

# build all original features into base factors

In [6]:
def origin_seconds_in_bucket(current_data: dict, hist_list: list) -> np.ndarray:
    current_data = np.array(list(current_data.values()), dtype=float).T
    current_data = np.nan_to_num(current_data)
    return current_data[:, col2index_map['seconds_in_bucket']]


def origin_imbalance_size(current_data: dict, hist_list: list) -> np.ndarray:
    current_data = np.array(list(current_data.values()), dtype=float).T
    current_data = np.nan_to_num(current_data)
    return current_data[:, col2index_map['imbalance_size']]


def origin_imbalance_buy_sell_flag(current_data: dict, hist_list: list) -> np.ndarray:
    current_data = np.array(list(current_data.values()), dtype=float).T
    current_data = np.nan_to_num(current_data)
    return current_data[:, col2index_map['imbalance_buy_sell_flag']]


def origin_reference_price(current_data: dict, hist_list: list) -> np.ndarray:
    current_data = np.array(list(current_data.values()), dtype=float).T
    current_data = np.nan_to_num(current_data)
    return current_data[:, col2index_map['reference_price']]


def origin_matched_size(current_data: dict, hist_list: list) -> np.ndarray:
    current_data = np.array(list(current_data.values()), dtype=float).T
    current_data = np.nan_to_num(current_data)
    return current_data[:, col2index_map['matched_size']]


def origin_far_price(current_data: dict, hist_list: list) -> np.ndarray:
    current_data = np.array(list(current_data.values()), dtype=float).T
    current_data = np.nan_to_num(current_data)
    return current_data[:, col2index_map['far_price']]


def origin_near_price(current_data: dict, hist_list: list) -> np.ndarray:
    current_data = np.array(list(current_data.values()), dtype=float).T
    current_data = np.nan_to_num(current_data)
    return current_data[:, col2index_map['near_price']]


def origin_bid_price(current_data: dict, hist_list: list) -> np.ndarray:
    current_data = np.array(list(current_data.values()), dtype=float).T
    current_data = np.nan_to_num(current_data)
    return current_data[:, col2index_map['bid_price']]


def origin_bid_size(current_data: dict, hist_list: list) -> np.ndarray:
    current_data = np.array(list(current_data.values()), dtype=float).T
    current_data = np.nan_to_num(current_data)
    return current_data[:, col2index_map['bid_size']]


def origin_ask_price(current_data: dict, hist_list: list) -> np.ndarray:
    current_data = np.array(list(current_data.values()), dtype=float).T
    current_data = np.nan_to_num(current_data)
    return current_data[:, col2index_map['ask_price']]


def origin_ask_size(current_data: dict, hist_list: list) -> np.ndarray:
    current_data = np.array(list(current_data.values()), dtype=float).T
    current_data = np.nan_to_num(current_data)
    return current_data[:, col2index_map['ask_size']]


def origin_wap(current_data: dict, hist_list: list) -> np.ndarray:
    current_data = np.array(list(current_data.values()), dtype=float).T
    current_data = np.nan_to_num(current_data)
    return current_data[:, col2index_map['wap']]

In [7]:
original_factor_name_list = [f'origin_{col}' for col in col2index_map.keys() if '_id' not in col]
original_factor_dict = {factor_name: utils.flatten_factor_value(factor_design.run_factor_value(df_train, eval(factor_name)), factor_name)[factor_name] for factor_name in tqdm(original_factor_name_list)}# this takes 4min

  0%|          | 0/12 [00:00<?, ?it/s]

Start reading df
Finish reading df
Start calculating factors
Accepted!!: Used 4.01 seconds for calculation factors. The limit is 300 seconds.


  8%|▊         | 1/12 [00:19<03:34, 19.48s/it]

Start reading df
Finish reading df
Start calculating factors
Accepted!!: Used 4.12 seconds for calculation factors. The limit is 300 seconds.


 17%|█▋        | 2/12 [00:38<03:13, 19.34s/it]

Start reading df
Finish reading df
Start calculating factors
Accepted!!: Used 3.97 seconds for calculation factors. The limit is 300 seconds.


 25%|██▌       | 3/12 [00:58<02:54, 19.41s/it]

Start reading df
Finish reading df
Start calculating factors
Accepted!!: Used 4.01 seconds for calculation factors. The limit is 300 seconds.


 33%|███▎      | 4/12 [01:17<02:34, 19.31s/it]

Start reading df
Finish reading df
Start calculating factors
Accepted!!: Used 4.01 seconds for calculation factors. The limit is 300 seconds.


 42%|████▏     | 5/12 [01:36<02:13, 19.12s/it]

Start reading df
Finish reading df
Start calculating factors
Accepted!!: Used 3.95 seconds for calculation factors. The limit is 300 seconds.


 50%|█████     | 6/12 [01:55<01:54, 19.06s/it]

Start reading df
Finish reading df
Start calculating factors
Accepted!!: Used 4.01 seconds for calculation factors. The limit is 300 seconds.


 58%|█████▊    | 7/12 [02:14<01:35, 19.01s/it]

Start reading df
Finish reading df
Start calculating factors
Accepted!!: Used 4.32 seconds for calculation factors. The limit is 300 seconds.


 67%|██████▋   | 8/12 [02:33<01:17, 19.28s/it]

Start reading df
Finish reading df
Start calculating factors
Accepted!!: Used 4.97 seconds for calculation factors. The limit is 300 seconds.


 75%|███████▌  | 9/12 [02:56<01:01, 20.39s/it]

Start reading df
Finish reading df
Start calculating factors
Accepted!!: Used 4.23 seconds for calculation factors. The limit is 300 seconds.


 83%|████████▎ | 10/12 [03:18<00:41, 20.75s/it]

Start reading df
Finish reading df
Start calculating factors
Accepted!!: Used 4.11 seconds for calculation factors. The limit is 300 seconds.


 92%|█████████▏| 11/12 [03:38<00:20, 20.46s/it]

Start reading df
Finish reading df
Start calculating factors
Accepted!!: Used 4.14 seconds for calculation factors. The limit is 300 seconds.


100%|██████████| 12/12 [03:57<00:00, 19.83s/it]


# Mid-price as a simple factor for testing

In [8]:
def mid_price(current_data: dict, hist_list: list) -> np.ndarray:
    '''
    This will be the main function to design your factors for the competition. Please
    define only one factor here each time. We provide you with:

    Current_data: a dictionary in the format of {column_name: column_value}, where column_name is from the original
    dataframe

    hist_list: A list for you to save the previous factor values (optional). For instance,
    if you are calculating a 100-day Moving Average (MA), then you can save the first calculated
    MA in hist_list, and then for the next MA calculation, you can use the saved ones.
    '''
    ###################### ADD YOUR CODE HERE FOR FACTORS DESIGN ######################
    # convert the current_data to your choice of numpy or pandas dataframe
    # current_data = pd.DataFrame(current_data)
    current_data = np.array(list(current_data.values()),
                            dtype=float).T  # this is faster than pd.DataFrame(current_data).values
    res = current_data[:, col2index_map['ask_price']] - current_data[:, col2index_map['bid_price']]
    ############################## NAN/Inf handling ######################################
    # if you have nan in your factor value, please fill it reasonably
    # res = np.nan_to_num(res) # this is slow because it also checks for inf.
    # res = np.where(np.isnan(res), 0, res)  # this is slightly faster than np.nan_to_num
    res[np.isnan(res)] = 0  # this is the fastest way to fill nan with 0
    ############################## END OF YOUR CODE ##############################
    return res  # The return value MUSE BE a numpy array, with no NaN value
    ####################################################################################


In [9]:
def s1_imbalance(current_data: dict, hist_list: list) -> np.ndarray:
    '''
    This will be the main function to design your factors for the competition. Please
    define only one factor here each time. We provide you with:

    Current_data: a dictionary in the format of {column_name: column_value}, where column_name is from the original
    dataframe

    hist_list: A list for you to save the previous factor values (optional). For instance,
    if you are calculating a 100-day Moving Average (MA), then you can save the first calculated
    MA in hist_list, and then for the next MA calculation, you can use the saved ones.
    '''
    ###################### ADD YOUR CODE HERE FOR FACTORS DESIGN ######################
    # convert the current_data to your choice of numpy or pandas dataframe
    # current_data = pd.DataFrame(current_data)
    current_data = np.array(list(current_data.values()),
                            dtype=float).T  # this is faster than pd.DataFrame(current_data).values
    res = (current_data[:, col2index_map['bid_size']] - current_data[:, col2index_map['ask_size']])/\
          (current_data[:, col2index_map['bid_size']] + current_data[:, col2index_map['ask_size']])
    ############################## NAN/Inf handling ######################################
    # if you have nan in your factor value, please fill it reasonably
    # res = np.nan_to_num(res) # this is slow because it also checks for inf.
    # res = np.where(np.isnan(res), 0, res)  # this is slightly faster than np.nan_to_num
    res[np.isnan(res)] = 0  # this is the fastest way to fill nan
    ############################## END OF YOUR CODE ##############################
    return res  # The return value MUSE BE a numpy array, with no NaN value
    ####################################################################################


In [10]:
new_factor_list = ['mid_price', 's1_imbalance']

In [11]:
new_factor= {factor_name: utils.flatten_factor_value(factor_design.run_factor_value(df_train, eval(factor_name)), factor_name)[factor_name] for factor_name in tqdm(new_factor_list)}

  0%|          | 0/2 [00:00<?, ?it/s]

Start reading df
Finish reading df
Start calculating factors
Accepted!!: Used 3.37 seconds for calculation factors. The limit is 300 seconds.


 50%|█████     | 1/2 [00:18<00:18, 18.86s/it]

Start reading df
Finish reading df
Start calculating factors


  res = (current_data[:, col2index_map['bid_size']] - current_data[:, col2index_map['ask_size']])/\


Accepted!!: Used 4.04 seconds for calculation factors. The limit is 300 seconds.


100%|██████████| 2/2 [00:38<00:00, 19.30s/it]


# Dump json files

In [22]:
utils.dump_json(original_factor_dict, 'original_factor_dict.json')
utils.dump_json(new_factor, 'new_factor.json')

In [1]:
import json
with open('original_factor_dict.json', 'r') as f:
    original_factor_dict = json.load(f)

In [5]:
original_factor_dict.keys()

dict_keys(['origin_seconds_in_bucket', 'origin_imbalance_size', 'origin_imbalance_buy_sell_flag', 'origin_reference_price', 'origin_matched_size', 'origin_far_price', 'origin_near_price', 'origin_bid_price', 'origin_bid_size', 'origin_ask_price', 'origin_ask_size', 'origin_wap'])

In [7]:
len(original_factor_dict['origin_imbalance_size'])

4357980

In [8]:
original_factor_dict['origin_imbalance_size'][:10]

[3180602.69,
 166603.91,
 302879.87,
 11917682.27,
 447549.96,
 0.0,
 969969.4,
 9412959.1,
 2394875.85,
 3039700.65]

# Demo backtest

In [12]:
new_factor.keys()

dict_keys(['mid_price', 's1_imbalance'])

In [13]:
original_factor_dict.keys()

dict_keys(['origin_seconds_in_bucket', 'origin_imbalance_size', 'origin_imbalance_buy_sell_flag', 'origin_reference_price', 'origin_matched_size', 'origin_far_price', 'origin_near_price', 'origin_bid_price', 'origin_bid_size', 'origin_ask_price', 'origin_ask_size', 'origin_wap'])

In [15]:
# check if all values in new_factor and original_factor_dict does not contain nan
for factor_name, factor_value in new_factor.items():
    assert not np.isnan(factor_value).any(), f'{factor_name} contains nan'

In [17]:
demo_backtest = factor_backtest.Factor_Backtest(original_factor_dict, new_factor, {})

In [19]:
demo_backtest.run_testing('train')

!xiaban 
{'mid_price_train': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 's1_imbalance_train': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
!xiaban 
{'mid_price_train': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 's1_imbalance_train': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
