### Maximization of the Log-Likelihood of Hidden Markov Models on the Limit Order Book

In [1]:
import pandas as pd
import numpy as np
from hmmlearn.hmm import GaussianHMM
from sklearn.model_selection import TimeSeriesSplit
import matplotlib.pyplot as plt



In [70]:
features=pd.read_csv('data/features.csv',index_col=0,nrows=50000)

def remove_duplicates(series):
    
    cleaned_series=series[np.insert(np.diff(series).astype(bool), 0, True)]
    dropped_els=len(series)-len(cleaned_series)
    
    print(f"Dropped {dropped_els} consecutive repeated values from input series")
    return cleaned_series

bidsize=remove_duplicates(features['Bid_Size'].values)
offersize=remove_duplicates(features['Offer_Size'].values)
bookimbalance=remove_duplicates(features['OB_IB'].values)
spread=remove_duplicates(features['spread'].values)

# formatted as numpy float 
np.savetxt(r'psg_example_hmm/vector_bidsize.txt', bidsize)
np.savetxt(r'psg_example_hmm/vector_offersize.txt', offersize)
np.savetxt(r'psg_example_hmm/vector_bookimbalance.txt', bookimbalance)
np.savetxt(r'psg_example_hmm/vector_spread.txt', spread)

features


Dropped 21612 consecutive repeated values from input series
Dropped 18853 consecutive repeated values from input series
Dropped 9378 consecutive repeated values from input series
Dropped 13603 consecutive repeated values from input series


Unnamed: 0,Bid_Size,Offer_Size,spread,OB_IB
2020-01-02 09:30:00.134062,0.000000,0.000000,0.342176,0.430041
2020-01-02 09:30:00.134336,0.000000,0.000000,0.330141,0.430041
2020-01-02 09:30:00.134532,0.000000,0.091578,0.372143,0.482240
2020-01-02 09:30:00.136081,0.000000,0.000000,0.336317,0.430041
2020-01-02 09:30:00.234474,0.000000,0.000000,0.380608,0.430041
...,...,...,...,...
2020-01-02 09:57:39.650647,0.000000,0.212638,0.158344,0.551244
2020-01-02 09:57:39.650839,0.000000,0.183157,0.158344,0.534440
2020-01-02 09:57:39.650865,0.121383,0.000000,0.204657,0.377841
2020-01-02 09:57:39.651071,0.000000,0.091578,0.158344,0.482240


### PSG

- Formatting as a valid input type for PSG function hmm_discrete(x,0)
- States x
    - Spread Widens
    - Spread Narrows

- Observations o
    - Will need to pass one feature vector in at a time 
    -   4 x m matrix of features

In [56]:
import os
os.add_dll_directory('C:\Aorda\PSG\lib')
import psgpython as psg 
from psg_loader import load_psg

In [4]:
load_psg()

In [71]:
psg_prob = psg.psg_importfromtext('./psg_example_hmm/problem_hmm_discrete.txt')

OK. Problem Imported



In [72]:
psg_prob['problem_statement'] = '\n'.join(psg_prob['problem_statement'])

In [73]:
solution=psg.psg_solver(psg_prob)

Running solver
Reading problem formulation
Asking for data information
Getting data
    100.0% of scenarios is processed
100% of vector_bookimbalance was read
Start optimization
Ext.iteration=0  Objective=0.521629177915E+00  Residual=0.000000000000E+00
Ext.iteration=10  Objective=0.521629177915E+00  Residual=0.000000000000E+00
Optimization is stopped
Solution is optimal
Calculating resulting outputs. Writing solution.
Objective: objective = 52191.9456035 [-1.042246337158E+17]
Solver has normally finished. Solution was saved.
Problem: problem_hmm_normal, solution_status = optimal
Timing: data_loading_time = 0.09, preprocessing_time = 26.62, solving_time = 0.65
Variables: optimal_point = point_problem_hmm_normal
Objective: objective = 52191.9456035 [-1.042246337158E+17]
Constraint: sum_of_probabilities_for_states = vector_sum_of_probabilities_for_states
Function: hmm_normal(2,vector_bookimbalance) =  5.219194560347E+04
OK. Solver Finished



In [74]:
solution.values()

dict_values(['problem_hmm_normal', 'optimal', ['problem_HMM_Normal, maximize', '  hmm_normal(2,vector_bookimbalance)', '  Solver: VAN'], ['Problem: problem_hmm_normal, solution_status = optimal', 'Timing: data_loading_time = 0.09, preprocessing_time = 26.62, solving_time = 0.65', 'Variables: optimal_point = point_problem_hmm_normal', 'Objective: objective = 52191.9456035 [-1.042246337158E+17]', 'Constraint: sum_of_probabilities_for_states = vector_sum_of_probabilities_for_states', 'Function: hmm_normal(2,vector_bookimbalance) =  5.219194560347E+04'], [['p1', 'p2', 'a1_1', 'a1_2', 'a2_1', 'a2_2', 'mu1', 'si1', 'mu2', 'si2'], array([0.        , 1.        , 0.89950989, 0.10049011, 0.01105765,
       0.98894235, 0.4419428 , 0.13609657, 0.43252718, 0.05953872])], array([2., 2., 2., ..., 2., 2., 2.]), array([1., 1., 1.]), array([0.00000000e+00, 1.75415238e-14, 1.93178806e-14]), [['state1', 'state2'], array([[0.        , 1.        ],
       [0.00135759, 0.99864241],
       [0.00195491, 0.9980

### HMM Learn

- Train HMM on one feature at a time
- Assume each feature is normally distributed 


In [22]:
tscv = TimeSeriesSplit(n_splits=5)


counter=0
for train_index, test_index in tscv.split(features):
    
    X_train, X_test = features.iloc[train_index].values, features.iloc[test_index].values
    y_train, y_test = outcomes['spread_state'].iloc[train_index].values, outcomes['spread_state'].iloc[test_index].values
    print(y_train)
    spread_states=3

    spread_model=GaussianHMM(n_components=spread_states,covariance_type='diag',startprob_prior=1.0, transmat_prior=1.0, algorithm='viterbi',params='stmc', init_params='stmc')
    spread_model.startprob_ = X_train[0]
    fitted_spread=spread_model.fit(X_train,y_train)
    log_prob=fitted_spread.score(X_test,y_test)
    print(f"Out of Sample Log Probability is {log_prob}")
    break

Even though the 'startprob_' attribute is set, it will be overwritten during initialization because 'init_params' contains 's'


[ 0 -1  1 -1  1  1  0  1 -1 -1  0  1  1 -1 -1  1  1  1 -1  1 -1  1  0  0
  1 -1  0  0  1 -1 -1  1  1 -1 -1  1 -1  1 -1  0  0  0 -1  1  1 -1  1 -1
 -1  1  1  1 -1  1 -1  0 -1  1 -1  0  0  0  0  0  0  0  1 -1  1 -1 -1  1
  0  0 -1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  1 -1  0  0  0  1 -1  0  1 -1  0  0  0  0  0  0  1 -1  0  1 -1
 -1  1  1  0  0 -1  0  0  0  1  0 -1  1 -1  0  1 -1  0  0  1 -1  1  1 -1
  1  1 -1  1 -1  1 -1  0  1 -1  1 -1  0  1 -1  1  1  0 -1  1  0  0 -1  1
  0  1]


  1 -1  0  0  1 -1 -1  1  1 -1 -1  1 -1  1 -1  0  0  0 -1  1  1 -1  1 -1
 -1  1  1  1 -1  1 -1  0 -1  1 -1  0  0  0  0  0  0  0  1 -1  1 -1 -1  1
  0  0 -1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  1 -1  0  0  0  1 -1  0  1 -1  0  0  0  0  0  0  1 -1  0  1 -1
 -1  1  1  0  0 -1  0  0  0  1  0 -1  1 -1  0  1 -1  0  0  1 -1  1  1 -1
  1  1 -1  1 -1  1 -1  0  1 -1  1 -1  0  1 -1  1  1  0 -1  1  0  0 -1  1
  0  1]; support for silently dropping samples is deprecated and will be removed
  fitted_spread=spread_model.fit(X_train,y_train)


IndexError: index 0 is out of bounds for axis 0 with size 0