### Maximization of the Log-Likelihood of Hidden Markov Models on the Limit Order Book

In [2]:
import pandas as pd
import numpy as np
from hmmlearn.hmm import GaussianHMM
from sklearn.model_selection import TimeSeriesSplit

In [52]:

features=pd.read_csv('data/features.csv',index_col=0,nrows=1000)


bidsize=features['Bid_Size'].values
offersize=features['Offer_Size'].values
bookimbalance=features['OB_IB'].values
spread=features['spread'].values

np.savetxt(r'psg_example_hmm/vector_bidsize.txt', bidsize, fmt='%d')
np.savetxt(r'psg_example_hmm/vector_offersize.txt', offersize, fmt='%d')
np.savetxt(r'psg_example_hmm/vector_bookimbalance.txt', bookimbalance, fmt='%d')
np.savetxt(r'psg_example_hmm/vector_spread.txt', spread, fmt='%d')


In [43]:
outcomes=pd.read_csv('data/outcomes.csv',index_col=0,nrows=1000)
outcomes

Unnamed: 0,spread_state
2020-01-02 09:30:00.134062,0
2020-01-02 09:30:00.134336,-1
2020-01-02 09:30:00.134532,1
2020-01-02 09:30:00.136081,-1
2020-01-02 09:30:00.234474,1
...,...
2020-01-02 09:30:28.784959,0
2020-01-02 09:30:28.819508,1
2020-01-02 09:30:28.832168,1
2020-01-02 09:30:29.169166,-1


### PSG

- Formatting as a valid input type for PSG function hmm_discrete(x,0)
- States x
    - Spread Widens
    - Spread Narrows

- Observations o
    - Will need to pass one feature vector in at a time 
    -   4 x m matrix of features

In [44]:
import os
os.add_dll_directory('C:\Aorda\PSG\lib')
import psgpython as psg 
from psg_loader import load_psg

In [45]:
load_psg()

In [47]:

np.savetxt(r'psg_example_hmm/vector_features.txt', features, fmt='%d')

In [60]:
psg_prob = psg.psg_importfromtext('./psg_example_hmm/problem_hmm_discrete.txt')

OK. Problem Imported



In [61]:
psg_prob['problem_statement'] = '\n'.join(psg_prob['problem_statement'])

In [62]:
solution=psg.psg_solver(psg_prob)

Running solver
Reading problem formulation
Asking for data information
Getting data
    100.0% of scenarios is processed
100% of vector_spread was read
Start optimization
Ext.iteration=0  Objective=0.568036977844E+00  Residual=0.000000000000E+00
Ext.iteration=10  Objective=0.568036977844E+00  Residual=0.000000000000E+00
Optimization is stopped
Solution is optimal
Calculating resulting outputs. Writing solution.
Objective: objective = 5770.57785658 [-1.058209031288E+16]
Solver has normally finished. Solution was saved.
Problem: problem_hmm_discrete, solution_status = optimal
Timing: data_loading_time = 0.06, preprocessing_time = 0.38, solving_time = 0.01
Variables: optimal_point = point_problem_hmm_discrete
Objective: objective = 5770.57785658 [-1.058209031288E+16]
Constraint: sum_of_probabilities_for_states = vector_sum_of_probabilities_for_states
Function: hmm_normal(2,vector_spread) =  5.770577856582E+03
OK. Solver Finished



In [63]:
solution.values()

dict_values(['problem_hmm_discrete', 'optimal', ['problem_HMM_Discrete, maximize', '  hmm_normal(2,vector_spread)'], ['Problem: problem_hmm_discrete, solution_status = optimal', 'Timing: data_loading_time = 0.06, preprocessing_time = 0.38, solving_time = 0.01', 'Variables: optimal_point = point_problem_hmm_discrete', 'Objective: objective = 5770.57785658 [-1.058209031288E+16]', 'Constraint: sum_of_probabilities_for_states = vector_sum_of_probabilities_for_states', 'Function: hmm_normal(2,vector_spread) =  5.770577856582E+03'], [['p1', 'p2', 'a1_1', 'a1_2', 'a2_1', 'a2_2', 'mu1', 'si1', 'mu2', 'si2'], array([1.00000000e+00, 2.54674643e-10, 9.14132588e-01, 8.58674120e-02,
       7.83722387e-01, 2.16277613e-01, 0.00000000e+00, 3.31882978e-04,
       3.42639689e+00, 5.84481337e+00])], array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 2., 1., 1., 1., 1., 1., 1., 1., 1., 2., 1., 2.,

### HMM Learn

- Multinomial, Gaussian, Gaussian Mixture States
- Thinking will do Gaussian and transform features to continuous values
- Easier for optimziation and comparison between libraies 

In [22]:
tscv = TimeSeriesSplit(n_splits=5)


counter=0
for train_index, test_index in tscv.split(features):
    
    X_train, X_test = features.iloc[train_index].values, features.iloc[test_index].values
    y_train, y_test = outcomes['spread_state'].iloc[train_index].values, outcomes['spread_state'].iloc[test_index].values
    print(y_train)
    spread_states=3

    spread_model=GaussianHMM(n_components=spread_states,covariance_type='diag',startprob_prior=1.0, transmat_prior=1.0, algorithm='viterbi',params='stmc', init_params='stmc')
    spread_model.startprob_ = X_train[0]
    fitted_spread=spread_model.fit(X_train,y_train)
    log_prob=fitted_spread.score(X_test,y_test)
    print(f"Out of Sample Log Probability is {log_prob}")
    break

Even though the 'startprob_' attribute is set, it will be overwritten during initialization because 'init_params' contains 's'


[ 0 -1  1 -1  1  1  0  1 -1 -1  0  1  1 -1 -1  1  1  1 -1  1 -1  1  0  0
  1 -1  0  0  1 -1 -1  1  1 -1 -1  1 -1  1 -1  0  0  0 -1  1  1 -1  1 -1
 -1  1  1  1 -1  1 -1  0 -1  1 -1  0  0  0  0  0  0  0  1 -1  1 -1 -1  1
  0  0 -1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  1 -1  0  0  0  1 -1  0  1 -1  0  0  0  0  0  0  1 -1  0  1 -1
 -1  1  1  0  0 -1  0  0  0  1  0 -1  1 -1  0  1 -1  0  0  1 -1  1  1 -1
  1  1 -1  1 -1  1 -1  0  1 -1  1 -1  0  1 -1  1  1  0 -1  1  0  0 -1  1
  0  1]


  1 -1  0  0  1 -1 -1  1  1 -1 -1  1 -1  1 -1  0  0  0 -1  1  1 -1  1 -1
 -1  1  1  1 -1  1 -1  0 -1  1 -1  0  0  0  0  0  0  0  1 -1  1 -1 -1  1
  0  0 -1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  1 -1  0  0  0  1 -1  0  1 -1  0  0  0  0  0  0  1 -1  0  1 -1
 -1  1  1  0  0 -1  0  0  0  1  0 -1  1 -1  0  1 -1  0  0  1 -1  1  1 -1
  1  1 -1  1 -1  1 -1  0  1 -1  1 -1  0  1 -1  1  1  0 -1  1  0  0 -1  1
  0  1]; support for silently dropping samples is deprecated and will be removed
  fitted_spread=spread_model.fit(X_train,y_train)


IndexError: index 0 is out of bounds for axis 0 with size 0