In [1]:
import numpy as np
import pandas as pd
import sys

path='/home/jbohn/jupyter/personal/Kernel_Learning/'
sys.path.append(path)
from Kernels.mkl_solver import primal_dual_opt
from Features.clean_data import clean_quotes
from Features.feature_preprocessing import generate_features_from_quotes
from LOB_Analysis.batch_data import batch_solve_mkl, batch_features



### Fetch Raw Quote Data

- Quotes for AAPL across 13 exchanges that offer U.S. equities
- Data corresponds to Jan 2020

In [6]:
quotes=pd.read_csv(path+'data/AAPL_quotes.csv')
quotes

  quotes=pd.read_csv(path+'data/AAPL_quotes.csv')


Unnamed: 0,Time,Exchange,Symbol,Bid_Price,Bid_Size,Offer_Price,Offer_Size,Quote_Condition,Sequence_Number,FINRA_BBO_Indicator,...,Best_Offer_Size,Best_Offer_FINRA_Market_Maker_ID,LULD_Indicator,LULD_NBBO_Indicator,SIP_Generated_Message_Identifier,Participant_Timestamp,FINRA_ADF_Timestamp,Security_Status_Indicator,Quote_Cancel_Correction,National_BBO_Ind
0,2020-01-02 04:00:00.065165,P,AAPL,278.00,7.0,0.00,0.0,R,2228,,...,0.0,,,,,40000064785664,,,,2
1,2020-01-02 04:00:00.065167,P,AAPL,278.00,14.0,0.00,0.0,R,2229,,...,0.0,,,,,40000064787456,,,,2
2,2020-01-02 04:00:00.065170,P,AAPL,293.72,9.0,0.00,0.0,R,2230,,...,0.0,,,,,40000064790784,,,,2
3,2020-01-02 04:00:32.677788,P,AAPL,295.08,10.0,295.81,1.0,R,2676,,...,1.0,,,,,40032677415424,,,,2
4,2020-01-02 04:00:36.260761,Q,AAPL,295.02,4.0,300.00,1.0,R,2696,,...,1.0,,,,,40036260727444,,,,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2925268,2020-01-09 19:59:47.657672,K,AAPL,310.38,20.0,310.40,7.0,R,33431302,,...,151.0,,,,,195947657433000,,,,2
2925269,2020-01-09 19:59:48.734101,Q,AAPL,310.25,1.0,310.40,141.0,R,33431303,,...,141.0,,,,,195948734083107,,,,2
2925270,2020-01-09 19:59:50.192454,K,AAPL,310.38,21.0,310.40,7.0,R,33431396,,...,141.0,,,,,195950192198000,,,,2
2925271,2020-01-09 19:59:53.972452,Q,AAPL,310.25,1.0,310.40,140.0,R,33431420,,...,140.0,,,,,195953972434327,,,,2


### Clean & Preprocessing

- Drops market quotes outside of open hours
- Criteria for invalid quotes (inverted spread, zero price or volume)
- Standardizes Participant Timestamp as time index

In [7]:
cleaned_quotes=clean_quotes(quotes)

### Feature Generation

- Generates set of features off quote data 
- Performs time aggregation to 60 seconds 
- Labels outcome; price direction relative to next interval

In [None]:
### paul : more features in generate_features_from_quotes

In [11]:
labelled_data=generate_features_from_quotes(cleaned_quotes,save=True)
labelled_data

Unnamed: 0_level_0,Exchange,Symbol,Best_Bid_Price,FB0,Best_Offer_Price,FA0,FB2,FA2,p_time,Next_Best_Bid,Next_Best_Offer,outcome
last_interval,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2020-01-02 09:30:00,K,AAPL,296.21,1.0,296.29,1.0,-1.0,0.0,2020-01-02 09:30:00.134336,295.71,295.78,-1
2020-01-02 09:31:00,Q,AAPL,295.71,1.0,295.78,2.0,0.0,0.0,2020-01-02 09:31:00.000691,295.49,295.52,-1
2020-01-02 09:32:00,Z,AAPL,295.49,3.0,295.52,2.0,0.0,1.0,2020-01-02 09:32:00.019756,295.68,295.73,1
2020-01-02 09:33:00,Z,AAPL,295.68,1.0,295.73,2.0,-1.0,0.0,2020-01-02 09:33:00.000277,296.53,296.60,1
2020-01-02 09:34:00,J,AAPL,296.53,3.0,296.60,4.0,0.0,1.0,2020-01-02 09:34:00.114073,296.90,296.93,1
...,...,...,...,...,...,...,...,...,...,...,...,...
2020-01-09 15:55:00,Q,AAPL,309.45,2.0,309.48,2.0,0.0,1.0,2020-01-09 15:55:00.002485,309.54,309.57,1
2020-01-09 15:56:00,Q,AAPL,309.54,1.0,309.57,4.0,0.0,0.0,2020-01-09 15:56:00.026630,309.60,309.61,1
2020-01-09 15:57:00,Z,AAPL,309.60,2.0,309.61,1.0,1.0,-3.0,2020-01-09 15:57:00.000373,309.47,309.50,-1
2020-01-09 15:58:00,Q,AAPL,309.47,5.0,309.50,10.0,0.0,-18.0,2020-01-09 15:58:00.006826,309.47,309.49,0


In [3]:
labelled_data=pd.read_csv(path+'data/labeled_data.csv')

In [4]:
features=labelled_data[['FB0','FA0','FB2','FA2']]
outcomes=labelled_data['outcome']

features

Unnamed: 0,FB0,FA0,FB2,FA2
0,1.0,1.0,-1.0,0.0
1,1.0,2.0,0.0,0.0
2,3.0,2.0,0.0,1.0
3,1.0,2.0,-1.0,0.0
4,3.0,4.0,0.0,1.0
...,...,...,...,...
385,5.0,1.0,0.0,0.0
386,6.0,2.0,0.0,-2.0
387,1.0,3.0,0.0,0.0
388,3.0,5.0,0.0,-16.0


### Batch Features

- For computational feasiblity will retrain model off batched data looking to evaluate performance in subsequent interval
- Batches features and outcomes in size batch_size which model will be trained from

In [5]:
batch_size=100
batch_data=batch_features(features,outcomes,batch_size)

### Train Single Kernel across batched dataset

In [15]:
#### Jarryd : single kernel training functionality in Kernels/single_kernel.py

### MKL Training across batched data 


In [16]:
#batched_estimates=batch_solve_mkl(features,outcomes,3,100,'polynomial',3,verbose=False)

Batch  0 Last Interval 2020-01-02 09:30:00 complete with weights  [0.33333333 0.33333333 0.33333333]
Batch  100 Last Interval 2020-01-02 11:10:00 complete with weights  [0.33333333 0.33333333 0.33333333]
Batch  200 Last Interval 2020-01-02 12:50:00 complete with weights  [0.33333333 0.33333333 0.33333333]
Batch  300 Last Interval 2020-01-02 14:30:00 complete with weights  [0.33333333 0.33333333 0.33333333]
Batch  400 Last Interval 2020-01-02 16:10:00 complete with weights  [0. 0. 1.]
Batch  500 Last Interval 2020-01-02 18:25:00 complete with weights  [0. 0. 1.]
Batch  600 Last Interval 2020-01-03 10:05:00 complete with weights  [0.33333333 0.33333333 0.33333333]
Batch  700 Last Interval 2020-01-03 11:45:00 complete with weights  [0.33333333 0.33333333 0.33333333]
Batch  800 Last Interval 2020-01-03 13:25:00 complete with weights  [0.33333333 0.33333333 0.33333333]
Batch  900 Last Interval 2020-01-03 15:05:00 complete with weights  [0.         0.33699696 0.66300304]
Batch  1000 Last Int

In [6]:
batched_estimates=batch_solve_mkl(features,outcomes,5,60,'gaussian',5,verbose=False)

Batch  0 Last Interval 0 complete with weights  [1. 0. 0. 0. 0.]
Batch  60 Last Interval 60 complete with weights  [1. 0. 0. 0. 0.]
Batch  120 Last Interval 120 complete with weights  [0.28386033 0.21169737 0.18724725 0.167312   0.14988304]
Batch  180 Last Interval 180 complete with weights  [0.         0.22668081 0.24756692 0.25925366 0.26649861]
Batch  240 Last Interval 240 complete with weights  [0.93657084 0.06342916 0.         0.         0.        ]
Batch  300 Last Interval 300 complete with weights  [0.19228772 0.19410417 0.19913151 0.20463958 0.20983703]
Batch  360 Last Interval 360 complete with weights  [1. 0. 0. 0. 0.]
