In [164]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns

# Separating date_id 0 to 400 (inclusive) as the train set and the rest as the test set

In [140]:
# data = pd.read_csv('train.csv')

In [141]:
# data['date_id'].unique()

In [142]:
# data['date_id'].value_counts().sort_index()# i.e. some earlier date_id have different value count than later date_id

In [143]:
# train_set, test_set = data[data['date_id']<=400], data[data['date_id']>400]

In [144]:
# train_set.to_csv('research_train_set.csv', index=False)
# test_set.to_csv('research_test_set.csv', index=False)

# Read in train and test set

In [145]:
# Load the previously exported research splits from CSV.
# NOTE(review): the outputs recorded in this notebook for train_set.head() and test_set list the
# same leading rows (row_id 0_0_0 ...) and test_set ends at date_id 400 — confirm that
# research_test_set.csv really holds the date_id > 400 rows and is not a copy of the train split.
train_set = pd.read_csv('research_train_set.csv')
test_set = pd.read_csv('research_test_set.csv')

In [146]:
# Share of all rows held by each split (train first, then test)
n_train, n_test = len(train_set), len(test_set)
n_total = n_train + n_test
print(n_train/n_total, n_test/n_total)

0.5 0.5


In [147]:
# Preview the first rows of the training split to check the available columns.
train_set.head()

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3180602.69,1,0.999812,13380276.64,,,0.999812,60651.5,1.000026,8493.03,1.0,-3.029704,0,0_0_0
1,1,0,0,166603.91,-1,0.999896,1642214.25,,,0.999896,3233.04,1.00066,20605.09,1.0,-5.519986,0,0_0_1
2,2,0,0,302879.87,-1,0.999561,1819368.03,,,0.999403,37956.0,1.000298,18995.0,1.0,-8.38995,0,0_0_2
3,3,0,0,11917682.27,-1,1.000171,18389745.62,,,0.999999,2324.9,1.000214,479032.4,1.0,-4.0102,0,0_0_3
4,4,0,0,447549.96,-1,0.999532,17860614.95,,,0.999394,16485.54,1.000016,434.1,1.0,-7.349849,0,0_0_4


# Factor Ideas:
1. volatility weighted synthetic index: the target computation has its own synthetic index component.
2. historical beta: correlation between stock and index
3. AR1: target usually bounces between two periods.
4. volatility: historical volatility of the stock will continue
5. order book imbalance shift around: how many times order books imbalance changes, indicates disagreement between buyers and sellers.
6. order book imbalance shift magnitude: small difference shift doesn't matter if the magnitude is small.
7. seconds_in_bucket: how stocks always perform in the same 10 seconds -> indicates possible algorithmic trading.
8. seconds_in_bucket volatility: historically how volatile the stock is in the same 10 seconds.
9. maximum bounce in x seconds: give the algo a feeling of how normal range should be
10. near price - far price: the difference of the market running auction or the market running auction + continuous market trading.
11. (near price-far price)/ (far price* seconds in bucket): the percentage of the market running auction or the market running auction + continuous market trading. indicator of market inefficiency. adjust for seconds in bucket.
12. time 0 and time 540 might be special: time 0: people may not be ready to trade, but algos are. time 540: last chance to trade.
13. spoofing: "fake orders" posted design to build order imbalance, so that it encourages the buy/sell of the stock such that it goes in the direction of the spoofer's order.
variance of the order book imbalance suggests spoofing. since we cannot cancel after 3:50, we can only do spoofing with both side large order that cancel each other out.

14. Rough fill probability: the probability for the best bid-ask price being filled. 
t=0, bid-ask price = 100-101. bid-ask vol = 1000-1000 
t=1, bid-ask price = 100-101. bid-ask vol = 500-500
then rough fill probability for bid-ask = 0.5 0.5
15. stock agreeableness: if the matching size is large relative to the order book imbalance size (compare to the cross-sectional median), then the buyer and seller agree on the price 
16. volume sitting on the order book (less best-bid-and-ask) = imbalance size - bid_size-ask_size
16. wap sitting on the order book: because wap is weighted average price for all order sitting on the order book,so we can extract parts that is not the best bid and ask price
17. best slot1 imbalance size: same calculation as imbalance size
18. difference in imbalance size: (imbalance_size-matched_size)/(matched_size+imbalance_size)
19. <s>spread: best ask price - best bid price. indicate the trading cost of crossing the spread.</s>
20. <s>spread difference: spread at time t - spread at time t-1. indicate the volatility of the spread.</s>
20. <s>spread volatility_ts: std of spread starting from first 5 observations of the day</s>
21. deviation of price from the closing price (closing price is the price at seconds_in_bucket 0)
22. <s>volume imbalance at best-bid-and-ask: (bid_size-ask_size)/(bid_size+ask_size)</s>
23. MOC flag: Nasdaq doesn't accept MOC orders after 3:55, i.e. seconds_in_bucket >= 300
23. longer memory: correlation with certain window.

In [148]:
def spread(df):
    """
    Quoted bid-ask spread at the first slot, row by row.
    :param df: training set (fixed for every factor); must provide 'ask_price' and 'bid_price'
    :return: np.ndarray holding ask_price - bid_price in the row order of df
    """
    best_ask = df['ask_price']
    best_bid = df['bid_price']
    return best_ask.sub(best_bid).values

In [149]:
def spread_diff(df):
    """
    Change of the bid-ask spread from the previous row of the same (stock_id, date_id) group.
    The first row of every group has no predecessor and is reported as 0.
    :param df: training set (fixed for every factor)
    :return: np.ndarray, one value per row of df
    """
    quotes = df[['stock_id', 'date_id', 'seconds_in_bucket']].assign(
        spread=df['ask_price'] - df['bid_price']
    )
    per_session = quotes.groupby(['stock_id', 'date_id'])['spread']
    # diff() leaves NaN on each group's first row; replace it with 0
    return per_session.diff().fillna(0).values

In [150]:
def spread_vol_5(df):
    """
    spread volatility rolling 5 observations
    Standard deviation of the bid-ask spread over the last 5 rows of each (stock_id, date_id) group.
    The first 4 rows of every group have no full window and come back as NaN.
    :param df: training set (fixed for every factor)
    :return: np.ndarray, one value per row of df, in df's row order
    """
    # Work on positional labels so the result lines up with df row by row, whatever index df carries
    # (a named or non-ascending index would otherwise break or misalign the output)
    quotes = df[['stock_id', 'date_id']].reset_index(drop=True)
    quotes['spread'] = (df['ask_price'] - df['bid_price']).values
    rolled = quotes.groupby(['stock_id', 'date_id'])['spread'].rolling(5).std()
    # groupby-rolling returns rows group by group with the two group keys prepended to the index;
    # dropping those levels leaves the row position, which reindex uses to restore the input order
    return rolled.droplevel([0, 1]).reindex(quotes.index).values

In [151]:
def s1_vol_imbalance(df):
    """
    Size imbalance at the first slot: (bid_size - ask_size) / (bid_size + ask_size).
    :param df: training set (fixed for every factor); must provide 'bid_size' and 'ask_size'
    :return: np.ndarray, one value per row of df
    """
    bid_qty, ask_qty = df['bid_size'], df['ask_size']
    net = bid_qty - ask_qty
    gross = bid_qty + ask_qty
    return (net / gross).values

In [171]:
def MOC_flag(df):
    """
    Flag rows that are at or after 300 seconds into the bucket.
    :param df: training set (fixed for every factor); must provide 'seconds_in_bucket'
    :return: np.ndarray of int64, 1 where seconds_in_bucket >= 300 and 0 otherwise
    """
    # Vectorised comparison instead of a per-row Python lambda; a NaN compares False and
    # therefore maps to 0, exactly as the lambda did
    return (df['seconds_in_bucket'] >= 300).astype('int64').values

# Call all factors

In [174]:
# Raw columns used directly as factors (10 of them).
# far price and near price are excluded because 80% nan
original_factor = [
    'seconds_in_bucket',
    'imbalance_size',
    'imbalance_buy_sell_flag',
    'reference_price',
    'matched_size',
    'bid_price',
    'bid_size',
    'ask_price',
    'ask_size',
    'wap',
]
# Engineered factors; each entry is the name of a factor function defined in this notebook
factor_name = [
    'spread',
    'spread_diff',
    'spread_vol_5',
    's1_vol_imbalance',
    'MOC_flag',
]

In [175]:
# Maps factor name -> array of that factor's values on train_set (filled by the following cells)
factors = {}

In [176]:
# Raw columns enter the factor dict unchanged, in the order listed in original_factor
factors.update({name: train_set[name].values for name in original_factor})

In [177]:
# Engineered factors: every entry of factor_name is the name of a factor function defined in
# this notebook. Look the function up in the notebook namespace rather than eval()-ing the
# string, so only a plain name lookup can ever happen here.
for factor in factor_name:
    factors[factor] = globals()[factor](train_set)

In [178]:
# Stack the per-factor arrays into one matrix with one row per factor.
# This cell rebinds `factors` from a dict to an ndarray, so guard it: without the check a
# second run of the cell fails because an ndarray has no .values() method.
if isinstance(factors, dict):
    factors = np.vstack(list(factors.values()))

In [179]:
# One row per factor, one column per observation
factors.shape

(15, 4357980)

# Fit a linear regression model 
To fit regular models we need to exclude NaN values.

In [192]:
import plotly.express as px

In [199]:
# Assemble factors and target into one frame (one column each) and drop every row with a NaN
columns = original_factor + factor_name + ['target']
stacked = np.vstack([factors, train_set['target'].values])
df = pd.DataFrame(stacked.T, columns=columns).dropna()
# correlation heatmap
abs_corr = df.corr().abs()
fig = px.imshow(abs_corr, text_auto=True, aspect='auto')
fig.update_layout(title='Correlation Heatmap')
fig.show()

In [181]:
# Absolute correlation of every factor with the last column (the target itself excluded), ascending
df.corr().iloc[-1, :-1].abs().sort_values()

11    0.000425
4     0.001193
0     0.001582
1     0.001608
14    0.002718
12    0.006383
10    0.007368
8     0.014186
6     0.017088
2     0.018200
3     0.021240
7     0.024406
5     0.025995
9     0.036649
13    0.111832
Name: 15, dtype: float64

In [182]:
# Identifier columns of test_set plus its bid-ask spread, for step-by-step inspection
res = test_set[['stock_id', 'date_id', 'seconds_in_bucket']].assign(
    spread=test_set['ask_price'] - test_set['bid_price']
)

In [183]:
# Rolling 5-row std of the spread per (stock_id, date_id); level_2 is the original row label,
# so sorting on it puts the values back in the input order
(
    res.groupby(['stock_id', 'date_id'])['spread']
    .rolling(5)
    .std()
    .reset_index()
    .sort_values('level_2')['spread']
    .values
)

array([           nan,            nan,            nan, ...,
       5.45912081e-05, 0.00000000e+00, 0.00000000e+00])

In [184]:
# Display res: identifier columns plus the spread column
res

Unnamed: 0,stock_id,date_id,seconds_in_bucket,spread
0,0,0,0,0.000214
1,1,0,0,0.000764
2,2,0,0,0.000895
3,3,0,0,0.000215
4,4,0,0,0.000622
...,...,...,...,...
4357975,195,400,540,0.000111
4357976,196,400,540,0.000263
4357977,197,400,540,0.000101
4357978,198,400,540,0.000279


In [185]:
# Display test_set.
# NOTE(review): the recorded output starts at row_id 0_0_0 and ends at date_id 400, like the
# train split — confirm the test CSV really holds the date_id > 400 rows.
test_set

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3180602.69,1,0.999812,13380276.64,,,0.999812,60651.50,1.000026,8493.03,1.000000,-3.029704,0,0_0_0
1,1,0,0,166603.91,-1,0.999896,1642214.25,,,0.999896,3233.04,1.000660,20605.09,1.000000,-5.519986,0,0_0_1
2,2,0,0,302879.87,-1,0.999561,1819368.03,,,0.999403,37956.00,1.000298,18995.00,1.000000,-8.389950,0,0_0_2
3,3,0,0,11917682.27,-1,1.000171,18389745.62,,,0.999999,2324.90,1.000214,479032.40,1.000000,-4.010200,0,0_0_3
4,4,0,0,447549.96,-1,0.999532,17860614.95,,,0.999394,16485.54,1.000016,434.10,1.000000,-7.349849,0,0_0_4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4357975,195,400,540,970377.20,-1,0.998844,22995348.39,0.998620,0.998620,0.998844,99960.00,0.998955,78370.28,0.998906,1.640320,22054,400_540_195
4357976,196,400,540,67450.10,-1,0.998683,6428925.92,0.998420,0.998420,0.998683,36887.40,0.998946,123977.36,0.998743,-3.939867,22054,400_540_196
4357977,197,400,540,376153.58,1,0.999465,14505279.32,0.999866,0.999766,0.999364,20413.90,0.999465,31470.44,0.999404,1.289845,22054,400_540_197
4357978,198,400,540,183655.04,-1,0.999938,59839618.44,0.999938,0.999938,0.999938,562673.79,1.000217,579856.56,1.000076,-7.489920,22054,400_540_198


In [186]:
# Display res again: identifier columns plus the spread column
res

Unnamed: 0,stock_id,date_id,seconds_in_bucket,spread
0,0,0,0,0.000214
1,1,0,0,0.000764
2,2,0,0,0.000895
3,3,0,0,0.000215
4,4,0,0,0.000622
...,...,...,...,...
4357975,195,400,540,0.000111
4357976,196,400,540,0.000263
4357977,197,400,540,0.000101
4357978,198,400,540,0.000279


In [187]:
# Spot-check one group (stock_id 0, date_id 0): rolling 5-row std of its spread
res.loc[
    (res['stock_id'] == 0) & (res['date_id'] == 0),
    'spread',
].rolling(5).std()

0                 NaN
191               NaN
382               NaN
573               NaN
764      5.860631e-05
955      5.860631e-05
1146     4.785185e-05
1337     4.785185e-05
1528     4.774202e-05
1719     4.774202e-05
1910     4.774202e-05
2101     4.774202e-05
2292     5.842517e-05
2483     5.860631e-05
2674     5.860631e-05
2865     5.860631e-05
3056     5.860631e-05
3247     5.879031e-05
3438     5.888124e-05
3629     4.829907e-05
3820     4.829907e-05
4011     5.888124e-05
4202     5.860631e-05
4393     5.860631e-05
4584     5.860631e-05
4775     4.796561e-05
4966     5.879031e-05
5157     5.888124e-05
5348     8.958348e-05
5539     1.789860e-04
5730     1.789860e-04
5921     1.789860e-04
6112     1.621752e-04
6303     1.757891e-04
6494     5.879031e-05
6685     5.860631e-05
6876     5.860631e-05
7067     4.785185e-05
7258     4.785185e-05
7449     2.803251e-12
7640     2.803251e-12
7831     2.803251e-12
8022     2.803251e-12
8213     2.803251e-12
8404     2.803251e-12
8595     2