In [1]:
import os
import sys

import numpy as np
import pandas as pd

# Project root. Set the KERNEL_LEARNING_PATH environment variable to run the
# notebook from another checkout; when it is unset the original development
# location is used, so existing behaviour is unchanged.
# os.path.join(..., '') guarantees the trailing separator that the later
# `path+'data/...'` concatenations rely on.
path=os.path.join(os.environ.get('KERNEL_LEARNING_PATH','/home/jbohn/jupyter/personal/Kernel_Learning/'),'')
# Make the project-local packages (Features, LOB_Analysis) importable.
# Guarded so that re-running the cell does not keep appending duplicates.
if path not in sys.path:
    sys.path.append(path)

from Features.clean_data import clean_quotes
from Features.feature_preprocessing import generate_features_from_quotes
from LOB_Analysis.batch_data import batch_solve_mkl, batch_features
from LOB_Analysis.batch_kernel import train_svm_batch, predict_svm_batch



### Fetch Raw Quote Data

- Quotes for AAPL across 13 exchanges that offer U.S. equities
- Data corresponds to Jan 2020

In [2]:
# Load the raw AAPL quote records from the project's data directory.
# NOTE(review): the file is presumably large (the recorded run of this cell
# ended in a KeyboardInterrupt) — consider caching or sampling if it is slow.
quotes=pd.read_csv(path+'data/AAPL_quotes.csv')
# Preview the first rows only rather than rendering the whole raw frame.
quotes.head()

KeyboardInterrupt: 

### Clean & Preprocessing

- Drops market quotes outside of open hours
- Removes invalid quotes (inverted spread, zero price or zero volume)
- Standardizes Participant Timestamp as time index

In [3]:
# Run the cleaning step described in the notes above on the raw quotes
# (clean_quotes is imported from Features.clean_data).
cleaned_quotes=clean_quotes(quotes)

### Feature Generation

- Generates a set of features from the quote data
- Aggregates the quotes into 60-second intervals
- Labels the outcome as the price direction relative to the next interval

In [4]:
# TODO (Paul): add more features to generate_features_from_quotes

In [5]:
# Build the labelled feature table from the cleaned quotes and display it.
# NOTE(review): save=True presumably persists the result to disk (a
# data/labeled_data.csv file is read back in a later cell) — confirm against
# generate_features_from_quotes.
labelled_data=generate_features_from_quotes(cleaned_quotes,save=True)
labelled_data

Unnamed: 0_level_0,Exchange,Symbol,Best_Bid_Price,FB0,Best_Offer_Price,FA0,FB2,FA2,p_time,Next_Best_Bid,Next_Best_Offer,outcome
last_interval,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2020-01-02 09:30:00,K,AAPL,296.21,1.0,296.29,1.0,-1.0,0.0,2020-01-02 09:30:00.134336,295.71,295.78,-1
2020-01-02 09:31:00,Q,AAPL,295.71,1.0,295.78,2.0,0.0,0.0,2020-01-02 09:31:00.000691,295.49,295.52,-1
2020-01-02 09:32:00,Z,AAPL,295.49,3.0,295.52,2.0,0.0,1.0,2020-01-02 09:32:00.019756,295.68,295.73,1
2020-01-02 09:33:00,Z,AAPL,295.68,1.0,295.73,2.0,-1.0,0.0,2020-01-02 09:33:00.000277,296.53,296.60,1
2020-01-02 09:34:00,J,AAPL,296.53,3.0,296.60,4.0,0.0,1.0,2020-01-02 09:34:00.114073,296.90,296.93,1
...,...,...,...,...,...,...,...,...,...,...,...,...
2020-01-09 15:55:00,Q,AAPL,309.45,2.0,309.48,2.0,0.0,1.0,2020-01-09 15:55:00.002485,309.54,309.57,1
2020-01-09 15:56:00,Q,AAPL,309.54,1.0,309.57,4.0,0.0,0.0,2020-01-09 15:56:00.026630,309.60,309.61,1
2020-01-09 15:57:00,Z,AAPL,309.60,2.0,309.61,1.0,1.0,-3.0,2020-01-09 15:57:00.000373,309.47,309.50,-1
2020-01-09 15:58:00,Q,AAPL,309.47,5.0,309.50,10.0,0.0,-18.0,2020-01-09 15:58:00.006826,309.47,309.49,0


In [2]:
# Reload the labelled feature table from the CSV in the project's data directory.
# Note: the displayed output below suggests the reloaded frame carries a plain
# integer row index rather than the `last_interval` timestamps shown when the
# table was first generated.
labelled_data=pd.read_csv(path+'data/labeled_data.csv')

In [3]:
# Target: the labelled outcome column.
outcomes = labelled_data['outcome']
# Model inputs: the four feature columns used throughout the rest of the notebook.
features = labelled_data.loc[:, ['FB0', 'FA0', 'FB2', 'FA2']]

features

Unnamed: 0,FB0,FA0,FB2,FA2
0,1.0,1.0,-1.0,0.0
1,1.0,2.0,0.0,0.0
2,3.0,2.0,0.0,1.0
3,1.0,2.0,-1.0,0.0
4,3.0,4.0,0.0,1.0
...,...,...,...,...
3213,2.0,2.0,0.0,1.0
3214,1.0,4.0,0.0,0.0
3215,2.0,1.0,1.0,-3.0
3216,5.0,10.0,0.0,-18.0


### Batch Features

- For computational feasibility, the model is retrained on batched data and its performance is evaluated on the subsequent interval
- Features and outcomes are split into batches of size `batch_size`, on which the model is trained

In [4]:
# Number of consecutive rows placed in each batch.
batch_size=100

# Split features/outcomes into batches. As the output below shows, the result
# is a dict keyed by batch number (floats: 0.0, 1.0, ...) whose values hold
# 'last_interval', 'features' and 'outcomes' for that batch.
batch_data=batch_features(features,outcomes,batch_size)
batch_data

{0.0: {'last_interval': 0,
  'features':     FB0  FA0  FB2  FA2
  0   1.0  1.0 -1.0  0.0
  1   1.0  2.0  0.0  0.0
  2   3.0  2.0  0.0  1.0
  3   1.0  2.0 -1.0  0.0
  4   3.0  4.0  0.0  1.0
  ..  ...  ...  ...  ...
  95  5.0  5.0  0.0  1.0
  96  5.0  1.0  1.0  0.0
  97  2.0  2.0 -1.0  0.0
  98  2.0  1.0  0.0 -1.0
  99  1.0  2.0 -8.0  0.0
  
  [100 rows x 4 columns],
  'outcomes': 0    -1
  1    -1
  2     1
  3     1
  4     1
       ..
  95   -1
  96    1
  97    0
  98    1
  99    1
  Name: outcome, Length: 100, dtype: int64},
 1.0: {'last_interval': 100,
  'features':       FB0  FA0  FB2  FA2
  100   5.0  1.0  0.0 -1.0
  101   5.0  3.0  0.0  2.0
  102   2.0  1.0  1.0 -1.0
  103   1.0  8.0  0.0  1.0
  104   1.0  1.0 -1.0  0.0
  ..    ...  ...  ...  ...
  195  11.0  3.0  0.0  1.0
  196   1.0  2.0  0.0 -3.0
  197   1.0  2.0  0.0  1.0
  198   2.0  1.0 -5.0 -3.0
  199   3.0  1.0  1.0  0.0
  
  [100 rows x 4 columns],
  'outcomes': 100    0
  101    1
  102    1
  103   -1
  104    1
    

### Train Single Kernel across batched dataset

In [5]:
# TODO (Jarryd): single-kernel training functionality, see Kernels/single_kernel.py

### MKL Training across batched data 


In [6]:
#batched_estimates=batch_solve_mkl(features,outcomes,3,100,'polynomial',3,verbose=False)

In [7]:


# Solve the MKL problem batch by batch; the progress log and the dict shown
# below contain one 5-element kernel-weight vector per batch.
# NOTE(review): the literal 60 looks like a batch size (the log advances in
# steps of 60), whereas batch_data was built with batch_size=100 — confirm that
# these weights line up with the batches they are later paired with in
# train_svm_batch / predict_svm_batch.
# NOTE(review): 'gaussian' and 5 are repeated in the train/predict calls further
# down; presumably they must stay in sync — verify against batch_solve_mkl.
batched_estimates=batch_solve_mkl(features,outcomes,5,60,'gaussian',5,verbose=False)

Batch  0 Last Interval 0 complete with weights  [1. 0. 0. 0. 0.]
Batch  60 Last Interval 60 complete with weights  [1. 0. 0. 0. 0.]
Batch  120 Last Interval 120 complete with weights  [0.28386033 0.21169737 0.18724725 0.167312   0.14988304]
Batch  180 Last Interval 180 complete with weights  [0.         0.22668081 0.24756692 0.25925366 0.26649861]
Batch  240 Last Interval 240 complete with weights  [0.93657084 0.06342916 0.         0.         0.        ]
Batch  300 Last Interval 300 complete with weights  [0.19228772 0.19410417 0.19913151 0.20463958 0.20983703]
Batch  360 Last Interval 360 complete with weights  [0.         0.22038004 0.24361844 0.26143424 0.27456728]
Batch  420 Last Interval 420 complete with weights  [0.         0.23652505 0.25088032 0.2555408  0.25705383]
Batch  480 Last Interval 480 complete with weights  [0.         0.22000433 0.24317917 0.26145632 0.27536018]
Batch  540 Last Interval 540 complete with weights  [0.         0.22599757 0.2469962  0.25971994 0.267286

In [8]:
# Inspect the per-batch kernel weights (dict keyed by batch number).
batched_estimates

{0.0: array([1., 0., 0., 0., 0.]),
 1.0: array([1., 0., 0., 0., 0.]),
 2.0: array([0.28386033, 0.21169737, 0.18724725, 0.167312  , 0.14988304]),
 3.0: array([0.        , 0.22668081, 0.24756692, 0.25925366, 0.26649861]),
 4.0: array([0.93657084, 0.06342916, 0.        , 0.        , 0.        ]),
 5.0: array([0.19228772, 0.19410417, 0.19913151, 0.20463958, 0.20983703]),
 6.0: array([0.        , 0.22038004, 0.24361844, 0.26143424, 0.27456728]),
 7.0: array([0.        , 0.23652505, 0.25088032, 0.2555408 , 0.25705383]),
 8.0: array([0.        , 0.22000433, 0.24317917, 0.26145632, 0.27536018]),
 9.0: array([0.        , 0.22599757, 0.2469962 , 0.25971994, 0.26728629]),
 10.0: array([1., 0., 0., 0., 0.]),
 11.0: array([0.39385248, 0.19914623, 0.15823779, 0.13302731, 0.1157362 ]),
 12.0: array([0.        , 0.22354877, 0.24497343, 0.26004074, 0.27143707]),
 13.0: array([1., 0., 0., 0., 0.]),
 14.0: array([0.91391959, 0.08608041, 0.        , 0.        , 0.        ]),
 15.0: array([0.        , 0.22

In [9]:
# Inspect the second batch. The dict keys are floats (0.0, 1.0, ...); the int 1
# works here because 1 == 1.0 and both hash identically.
batch_data[1]

{'last_interval': 100,
 'features':       FB0  FA0  FB2  FA2
 100   5.0  1.0  0.0 -1.0
 101   5.0  3.0  0.0  2.0
 102   2.0  1.0  1.0 -1.0
 103   1.0  8.0  0.0  1.0
 104   1.0  1.0 -1.0  0.0
 ..    ...  ...  ...  ...
 195  11.0  3.0  0.0  1.0
 196   1.0  2.0  0.0 -3.0
 197   1.0  2.0  0.0  1.0
 198   2.0  1.0 -5.0 -3.0
 199   3.0  1.0  1.0  0.0
 
 [100 rows x 4 columns],
 'outcomes': 100    0
 101    1
 102    1
 103   -1
 104    1
       ..
 195   -1
 196   -1
 197   -1
 198    1
 199    1
 Name: outcome, Length: 100, dtype: int64}

In [10]:
# Train the per-batch SVMs from the batched data and the per-batch MKL weights;
# batch_index is handed on to predict_svm_batch further down.
# NOTE(review): 'gaussian' and 5 mirror the arguments given to batch_solve_mkl —
# presumably they must match; confirm against train_svm_batch.
batch_svm,batch_index=train_svm_batch(batch_data,batched_estimates,'gaussian',5)

In [11]:
# Inspect the trained models: one SVC(kernel='precomputed') per batch key.
batch_svm

{0.0: SVC(kernel='precomputed'),
 1.0: SVC(kernel='precomputed'),
 2.0: SVC(kernel='precomputed'),
 3.0: SVC(kernel='precomputed'),
 4.0: SVC(kernel='precomputed'),
 5.0: SVC(kernel='precomputed'),
 6.0: SVC(kernel='precomputed'),
 7.0: SVC(kernel='precomputed'),
 8.0: SVC(kernel='precomputed'),
 9.0: SVC(kernel='precomputed'),
 10.0: SVC(kernel='precomputed'),
 11.0: SVC(kernel='precomputed'),
 12.0: SVC(kernel='precomputed'),
 13.0: SVC(kernel='precomputed'),
 14.0: SVC(kernel='precomputed'),
 15.0: SVC(kernel='precomputed'),
 16.0: SVC(kernel='precomputed'),
 17.0: SVC(kernel='precomputed'),
 18.0: SVC(kernel='precomputed'),
 19.0: SVC(kernel='precomputed'),
 20.0: SVC(kernel='precomputed'),
 21.0: SVC(kernel='precomputed'),
 22.0: SVC(kernel='precomputed'),
 23.0: SVC(kernel='precomputed'),
 24.0: SVC(kernel='precomputed'),
 25.0: SVC(kernel='precomputed'),
 26.0: SVC(kernel='precomputed'),
 27.0: SVC(kernel='precomputed'),
 28.0: SVC(kernel='precomputed'),
 29.0: SVC(kernel='preco

In [12]:
# Evaluate the per-batch SVMs; the result (displayed in the next cell) holds
# 'accuracy' and 'recall' for each batch key.
# NOTE(review): which rows each model is scored on is decided inside
# predict_svm_batch (presumably the subsequent interval, per the notes above) — confirm.
evaluation_dict=predict_svm_batch(batch_svm,batch_index,batch_data,batched_estimates,'gaussian',5)

In [13]:
# Show the per-batch evaluation metrics.
# NOTE(review): accuracy and recall are identical for every batch shown, which
# is what micro- or support-weighted recall gives on single-label multiclass
# data — confirm this is the intended recall definition.
evaluation_dict

{0.0: {'accuracy': 0.43, 'recall': 0.43},
 1.0: {'accuracy': 0.47, 'recall': 0.47},
 2.0: {'accuracy': 0.28, 'recall': 0.28},
 3.0: {'accuracy': 0.35, 'recall': 0.35},
 4.0: {'accuracy': 0.25, 'recall': 0.25},
 5.0: {'accuracy': 0.35, 'recall': 0.35},
 6.0: {'accuracy': 0.39, 'recall': 0.39},
 7.0: {'accuracy': 0.39, 'recall': 0.39},
 8.0: {'accuracy': 0.39, 'recall': 0.39},
 9.0: {'accuracy': 0.41, 'recall': 0.41},
 10.0: {'accuracy': 0.25, 'recall': 0.25},
 11.0: {'accuracy': 0.3, 'recall': 0.3},
 12.0: {'accuracy': 0.33, 'recall': 0.33},
 13.0: {'accuracy': 0.35, 'recall': 0.35},
 14.0: {'accuracy': 0.3, 'recall': 0.3},
 15.0: {'accuracy': 0.31, 'recall': 0.31},
 16.0: {'accuracy': 0.29, 'recall': 0.29},
 17.0: {'accuracy': 0.34, 'recall': 0.34},
 18.0: {'accuracy': 0.38, 'recall': 0.38},
 19.0: {'accuracy': 0.32, 'recall': 0.32},
 20.0: {'accuracy': 0.33, 'recall': 0.33},
 21.0: {'accuracy': 0.21, 'recall': 0.21},
 22.0: {'accuracy': 0.34, 'recall': 0.34},
 23.0: {'accuracy': 0.4, 