## Preprocessing 

In [1]:
%load_ext autoreload
%autoreload 2

In [4]:
'''
Welcome to Optiver 2023 MoC Challenge! We designed this notebook for you to explore the data given by the challenge, research and design your own factors based on
the training data given and the helper functions we built for you!
Feel free to play around with the data and get familiar with it!
'''

# We will import the necessary libraries here. You can import any libraries you need for your beautiful factors!
from out_of_sample import utils
from out_of_sample import factor_design
from out_of_sample import factor_backtest
import numpy as np
import pandas as pd
from tqdm import tqdm

# Load data

In [5]:
# Column-name -> column-index lookup. Factor functions use it to pick columns
# (e.g. 'bid_size', 'ask_size') out of the array they build from current_data.
col2index_map = utils.load_json('./out_of_sample/col2index_map.json')

In [None]:
# Load the dictionary forms of the train and test sets; these are what the factor runner
# (factor_design.run_factor_value) iterates over.
# presumably they were pre-built from the research data frames to speed up research -- TODO confirm in utils
# Loading can take up to half a minute, so DON'T RE-RUN THIS CELL TOO OFTEN!
df_train_dic_sorted = utils.load_json('./out_of_sample/df_train_dic_sorted.json')
df_test_dic_sorted = utils.load_json('./out_of_sample/test_data_dic_sorted.json')

In [None]:
# NOTE(review): add_factor_to_existed() takes no arguments and its effect is not visible in this notebook.
# presumably it merges previously dumped factor files into the existing-factor store that is loaded next -- confirm in utils
utils.add_factor_to_existed()

In [None]:
# Existing factor values for the train and test sets. The train set is handed to the
# backtester as `existed_factors`.
# NOTE(review): both files are read from the working directory while every other input lives under
# ./out_of_sample/, and the two file names are not parallel (existed_factors.json vs
# existed_factor_test.json) -- confirm that both paths are the intended ones.
existed_factor_train = utils.load_json('./existed_factors.json')
existed_factor_test = utils.load_json('./existed_factor_test.json')

In [None]:
# Stored performance scores of the existing factors for the train and test sets.
# The train scores are handed to the backtester as `factor_performance`.
factor_performance_train = utils.load_json('./out_of_sample/factor_performance.json')
factor_performance_test = utils.load_json('./out_of_sample/factor_performance_test.json')

# New factor here

In [None]:
def s1_imbalance_signed_pow_opt(current_data: dict, hist_list=[]) -> np.ndarray:
    '''
    Sign-flipped, power-scaled bid/ask size imbalance.

    For every row the size imbalance is

        imb = (bid_size - ask_size) / (bid_size + ask_size)

    and the factor value is ``sign(-imb) * |imb| ** 1.4242424242424243``.
    Rows where the ratio is undefined (0/0, a NaN input, or a zero denominator
    that yields +/-inf) receive the neutral value 0.

    Parameters
    ----------
    current_data : dict
        ``{column_name: column_values}``. The values are stacked, in dict order,
        into a float array; the module-level ``col2index_map`` supplies the
        positions of ``'bid_size'`` and ``'ask_size'`` in that array.
    hist_list : list, optional
        Part of the factor-function template signature. It is neither read nor
        modified here; the default is left as ``[]`` to match that template.

    Returns
    -------
    np.ndarray
        One float per row, free of NaN.

    Notes
    -----
    Rationale for the exponent (author's note): most of the information in the
    imbalance is in its sign, and the sign alone corresponds to an exponent of
    0 (sign * abs ** 0). The exponent was optimised and 1.424 came out best.
    '''
    exponent = 1.4242424242424243  # tuned value, see Notes

    # Stack the columns into a 2-D float array (one column per dict entry);
    # this is faster than going through pd.DataFrame(current_data).values.
    data = np.array(list(current_data.values()), dtype=float).T
    bid = data[:, col2index_map['bid_size']]
    ask = data[:, col2index_map['ask_size']]

    # A zero denominator is an expected input, so silence the floating-point
    # warnings here and patch the undefined results explicitly below.
    with np.errstate(divide='ignore', invalid='ignore'):
        res = (bid - ask) / (bid + ask)

    # One pass that neutralises NaN and +/-inf alike; a plain isnan() mask
    # would let an infinite ratio through to the returned factor.
    res[~np.isfinite(res)] = 0

    # The return value MUST BE a numpy array, with no NaN value.
    return np.sign(-res) * np.abs(res) ** exponent


# Some Examples of Factor Design

In [None]:
'''
Each factor should be defined as a function described above. After you designed all your factors and you are all good to test your factors,
you can simply ADD your factor name to the factor_list below, and run the cell. The backtest result will be printed out for you to see!
'''
# Names of the factor functions to evaluate. Each entry must match a function defined in this
# notebook, because the factor-calculation cells resolve the name with eval().
new_factor_list = ['s1_imbalance_signed_pow_opt']  # fine-tuned factor set


In [None]:
# DO NOT MODIFY THE FOLLOWING CODE
# Run this cell once you want to calculate your factor values and prepare for the test of your factor performance!
# Builds {factor_name: values} for the train set: every name in new_factor_list is resolved to its
# function with eval(), run over df_train_dic_sorted, flattened, and the entry stored under that
# name is kept.
# NOTE: eval() is acceptable here only because new_factor_list is written by hand in this notebook;
# never feed it names that come from an external source.
new_factors_train = {factor_name: utils.flatten_factor_value(
    factor_design.run_factor_value(df_train_dic_sorted, eval(factor_name), factor_name), factor_name)[factor_name] for
               factor_name in tqdm(new_factor_list)}

In [None]:
# Raw research CSVs. df_test['target'] is used later for the out-of-sample check.
# NOTE(review): df_train is not referenced by any other cell shown in this notebook -- drop this
# read if nothing else needs it.
df_train = pd.read_csv('research_train_set.csv')
df_test = pd.read_csv('research_test_set.csv')

In [None]:
# DO NOT MODIFY THE FOLLOWING CODE
# Run this cell once you want to calculate your factor values and prepare for the test of your factor performance!
# Builds {factor_name: values} for the test set: every name in new_factor_list is resolved to its
# function with eval(), run over df_test_dic_sorted, flattened, and the entry stored under that
# name is kept.
# NOTE: eval() is acceptable here only because new_factor_list is written by hand in this notebook;
# never feed it names that come from an external source.
new_factors_test = {factor_name: utils.flatten_factor_value(
    factor_design.run_factor_value(df_test_dic_sorted, eval(factor_name), factor_name), factor_name)[factor_name] for
               factor_name in tqdm(new_factor_list)}

## Factor Backtesting

In [None]:
'''
The factor_backtest is an object that can be used to backtest your factors. It takes three arguments:

    existed_facors: This is the dictionary stored all the passed factors with their names as keys and values as values
    testing_factors: This is the dictionary stored all the factors waited to be tested
    factor_performance: This is the dictionary stored all the factors' performance score (which is the Pearson correlation coefficient
    of factor values vs corresponded target vector)
'''
# We first load the existing factors and their performance scores for you.
# This takes about 12 seconds; you only need to run this cell once to load the existing factors and their performance.
# existed_factors = utils.load_json_factors('./factor_design_ver3_alpha/existed_factors.json')
# factor_performance = utils.load_json('./factor_design_ver3_alpha/factor_performance.json')

In [None]:
# Backtester for the in-sample (train) run: the existing train factors, the newly computed
# train factors to be tested, and the existing factors' performance scores.
demo_backtest = factor_backtest.Factor_Backtest(existed_factors=existed_factor_train, testing_factors=new_factors_train,
                                                factor_performance=factor_performance_train)

In [None]:
# Test the new train-set factors against the existing factors the backtester was built with.
demo_backtest.run_testing()  # this will print out the in-sample performance of your factors

In [None]:
# Out-of-sample check of the new factor against the test-set target; missing targets are replaced with 0.
# NOTE(review): this assumes new_factors_test is in the same row order as research_test_set.csv.
# The factor values are computed from the sorted dictionary (df_test_dic_sorted) -- confirm the alignment.
demo_backtest.run_out_of_sample('s1_imbalance_signed_pow_opt', new_factors_test['s1_imbalance_signed_pow_opt'],df_test['target'].fillna(0))

In [None]:
from datetime import datetime

In [None]:
# Date stamp (year, month, day concatenated) used as the suffix of the temporary factor dump.
# A single now() snapshot is formatted so that year, month and day can never come from two
# different days if the cell happens to run across midnight.
# NOTE(review): month and day are not zero-padded, so e.g. 2023-01-11 and 2023-11-01 both give
# "2023111". The format is deliberately left unchanged because other code may build the same file
# name; strftime('%Y%m%d') would remove the ambiguity -- confirm against utils before switching.
today = '{0.year}{0.month}{0.day}'.format(datetime.now())

In [None]:
# Display the stamp so the suffix of the temp file name can be checked before writing.
today

In [None]:
# Persist the freshly computed train-set factor values to a dated temp file under ./out_of_sample/.
# presumably utils.add_factor_to_existed() picks this file up afterwards -- TODO confirm in utils
utils.dump_json_factors(new_factors_train,f'./out_of_sample/temp{today}.json')

# locally update new factors

In [None]:
# Second call in this notebook; the first one runs before the existing factors are loaded.
# presumably this folds the newly dumped temp factors into the existing-factor store -- TODO confirm in utils
utils.add_factor_to_existed()
