In [1]:
import numpy as np
import pandas as pd
from hmmlearn.hmm import GaussianHMM
from datetime import date,timedelta


import os
# Register the PSG native-library folder so `psgpython` can locate its DLLs.
# Raw string: in a normal literal '\A', '\P' and '\l' are invalid escape
# sequences (warned about by recent Python versions).
# NOTE(review): machine-specific absolute path - consider making it configurable.
os.add_dll_directory(r'C:\Aorda\PSG\lib')
import psgpython as psg 
from psg_loader import load_psg


### Script to iteratively fit an HMM model
- The `method` parameter selects the fitting backend: 1 = PSG, 2 = HMMLearn
- Fits a separate model for every available feature (spread, bid size, offer size, book imbalance)

In [15]:

def remove_duplicates(series):
    """ Remove identical consecutive observations

    Keeps the first element of every run of equal consecutive values, e.g.
    ``[1, 1, 2, 2, 1]`` becomes ``[1, 2, 1]``. Values that repeat later in the
    sequence (non-adjacent) are kept.

    Parameters
    ----------
    series : 1-D numpy array of numeric values.

    Returns
    -------
    The input with consecutive repeats removed. An empty input yields an
    empty result.
    """
    # Guard: for an empty input the mask below would have length 1 and the
    # boolean indexing would raise IndexError.
    if len(series)==0:
        return series[:0]

    # A non-zero first difference marks the start of a new run; the very first
    # observation is always kept.
    keep_mask=np.insert(np.diff(series).astype(bool), 0, True)
    return series[keep_mask]


def prep_features(dt):
    """ Load one day's grouped features and de-duplicate each column

    Reads ``data/agg_features/grouped_features_{dt}.csv`` and passes the
    values of each feature column through ``remove_duplicates``. The columns
    are filtered independently of one another, so the resulting arrays need
    not share a length.

    Returns a dict keyed by column name: 'Bid_Size', 'Offer_Size', 'OB_IB'
    and 'spread'.
    """
    day_frame=pd.read_csv(f'data/agg_features/grouped_features_{dt}.csv')
    column_names=('Bid_Size','Offer_Size','OB_IB','spread')

    return {name:remove_duplicates(day_frame[name].values) for name in column_names}

def extract_params(param_dict,method):
    """ Extract Params from optimized model

    Turns the per-day parameters collected by the fitting loop into one
    DataFrame with a row per key of ``param_dict``.

    Parameters
    ----------
    param_dict : dict
        * method 1: key -> flat sequence of 10 values ordered as
          p1, p2, a11, a12, a21, a22, mu1, si1, mu2, si2.
        * method 2: key -> dict with entries "Mean" (2 values),
          "Sigma" (2 values) and "Transition" (4 values).
    method : int
        1 for the PSG layout, 2 for the HMMLearn layout.

    Returns
    -------
    pandas.DataFrame indexed by the keys of ``param_dict``:
        * method 1: columns a11, a12, a21, a22, mu1, si1, mu2, si2
          (p1 and p2 are dropped).
        * method 2: columns mu1, mu2, sigma1, sigma2, a11, a12, a21, a22.
    ``None`` for any other ``method``.
    """
    if method==1:
        psg_columns=['p1','p2','a11','a12', 'a21', 'a22','mu1','si1','mu2','si2']
        param_df=pd.DataFrame.from_dict(param_dict,orient='index',columns=psg_columns)
        # NOTE(review): p1/p2 are presumably the initial state probabilities - confirm.
        return param_df.drop(columns=['p1','p2'])

    elif method==2:
        # Entry name in each per-day dict -> the output columns it expands to.
        # NOTE(review): "Sigma" is filled from hmmlearn's `covars_` by the caller
        # in this file, so sigma1/sigma2 are presumably variances while the PSG
        # si1/si2 may be standard deviations - confirm before comparing them.
        layout=(('Mean',['mu1', 'mu2']),
                ('Sigma',['sigma1', 'sigma2']),
                ('Transition',['a11', 'a12','a21','a22']))

        # No fitted days at all: without this guard the lookup of 'Mean'
        # below raises KeyError on the empty frame.
        if not param_dict:
            return pd.DataFrame(columns=[col for _,cols in layout for col in cols])

        param_df=pd.DataFrame.from_dict(param_dict,orient='index')
        # Expand each array-valued column into scalar columns, carrying the
        # original index along so no separate re-indexing step is needed.
        pieces=[pd.DataFrame(param_df[entry].to_list(),columns=cols,index=param_df.index)
                for entry,cols in layout]
        return pd.concat(pieces,axis=1)

    # Unknown method: kept as a silent None for backward compatibility.
    return None


# (result label, key in the dict returned by prep_features), listed in the
# order in which the models are fitted and returned.
_HMM_FEATURES=(("spread","spread"),
               ("bidsize","Bid_Size"),
               ("offersize","Offer_Size"),
               ("bookimbalance","OB_IB"))


def _solve_psg_feature(label):
    """ Run the PSG problem for one feature and return its parameter vector
    """
    problem=psg.psg_importfromtext(f'./psg_text_hmm/problem_hmm_normal_{label}.txt')
    # The imported statement is a sequence of lines; the solver is given a single string.
    problem['problem_statement']='\n'.join(problem['problem_statement'])
    solution=psg.psg_solver(problem)
    # NOTE(review): positional lookup into the solution dict - presumably the
    # optimal point; confirm against the PSG output layout.
    return list(solution.values())[4][1]


def _fit_hmmlearn_feature(observations,random_state=None):
    """ Fit a 2-state spherical GaussianHMM to one feature and return its parameters
    """
    model=GaussianHMM(n_components=2,algorithm='viterbi',covariance_type="spherical",
                      min_covar=1e-4, n_iter=1000,tol=1e-8,random_state=random_state)
    # hmmlearn expects a 2-D (n_samples, n_features) array.
    fitted=model.fit(observations.reshape(-1, 1))
    return {"Mean":fitted.means_.flatten(),
            "Sigma":fitted.covars_.flatten(),
            "Transition":fitted.transmat_.flatten()}


def fit_hmm(method,start=None,n_days=30,random_state=None):
    """ Fit HMM model with PSG and HMMLearn

    For every calendar day in the window, loads that day's features with
    ``prep_features`` and fits one model per feature.

    Parameters
    ----------
    method : int
        1 fits with the PSG solver, 2 fits with HMMLearn's GaussianHMM.
    start : datetime.date, optional
        First day of the window. Defaults to 2020-01-01.
    n_days : int, optional
        Number of consecutive calendar days to try. Defaults to 30.
    random_state : optional
        Forwarded to GaussianHMM (method 2 only) for reproducible fits.
        Defaults to None, i.e. hmmlearn's unseeded behaviour.

    Returns
    -------
    dict mapping "spread", "bidsize", "offersize" and "bookimbalance" to the
    DataFrame produced by ``extract_params`` for that feature, or ``None``
    (after printing a message) when ``method`` is neither 1 nor 2.

    Notes
    -----
    Days without a feature file (``FileNotFoundError``) are skipped. Any
    other error while loading a day is raised instead of being silently
    swallowed.
    """
    if method not in (1,2):
        print("Not a valid method")
        return

    if start is None:
        start=date(2020,1,1)
    days=[start+timedelta(days=i) for i in range(n_days)]

    # label -> {day -> fitted parameters}
    params_by_feature={label:{} for label,_ in _HMM_FEATURES}

    for dt in days:
        try:
            dt_features=prep_features(dt)
        except FileNotFoundError:
            # No data file for this day: skip it.
            continue

        if method==1:
            # psg training: the problem files read their observations from
            # these text vectors, so write all of them before solving.
            for label,key in _HMM_FEATURES:
                # formatted as numpy float
                np.savetxt(f'psg_text_hmm/vector_{label}.txt', dt_features[key])
            for label,_ in _HMM_FEATURES:
                params_by_feature[label][dt]=_solve_psg_feature(label)
        else:
            print(f"Fitting HMM usign HMM-Learn for {dt}")
            for label,key in _HMM_FEATURES:
                params_by_feature[label][dt]=_fit_hmmlearn_feature(dt_features[key],random_state)

    return {label:extract_params(day_params,method)
            for label,day_params in params_by_feature.items()}

### PSG


In [21]:
# Fit every feature with the PSG solver (method 1); the result is a dict of
# per-day parameter DataFrames keyed by feature label.
psg_df=fit_hmm(method=1)

OK. Problem Imported

Running solver
Reading problem formulation
Asking for data information
Getting data
    100.0% of scenarios is processed
100% of vector_spread was read
Start optimization
Ext.iteration=0  Objective=0.740725099987E+00  Residual=0.000000000000E+00
Ext.iteration=10  Objective=0.740725099987E+00  Residual=0.000000000000E+00
Optimization is stopped
Solution is optimal
Calculating resulting outputs. Writing solution.
Objective: objective = 32086.1760096 [-4.512213776820E+16]
Solver has normally finished. Solution was saved.
Problem: problem_hmm_normal, solution_status = optimal
Timing: data_loading_time = 0.09, preprocessing_time = 11.73, solving_time = 0.91
Variables: optimal_point = point_problem_hmm_normal
Objective: objective = 32086.1760096 [-4.512213776820E+16]
Constraint: sum_of_probabilities_for_states = vector_sum_of_probabilities_for_states
Function: hmm_normal(2,vector_spread) =  3.208617600959E+04
OK. Solver Finished

OK. Problem Imported

Running solver
Rea

In [22]:
# Write each feature's PSG parameter table to data/results/psg_<feature>.csv.
for feature,df in psg_df.items():
    print(f"Feature {feature} df :")
    df.to_csv(f'data/results/psg_{feature}.csv')

Feature spread df :
Feature bidsize df :
Feature offersize df :
Feature bookimbalance df :


### HMM Model

In [18]:
# Fit every feature with HMMLearn (method 2); the result is a dict of
# per-day parameter DataFrames keyed by feature label.
hmm_df=fit_hmm(method=2)

Fitting HMM usign HMM-Learn for 2020-01-02
Fitting HMM usign HMM-Learn for 2020-01-03
Fitting HMM usign HMM-Learn for 2020-01-06
Fitting HMM usign HMM-Learn for 2020-01-07
Fitting HMM usign HMM-Learn for 2020-01-08
Fitting HMM usign HMM-Learn for 2020-01-09
Fitting HMM usign HMM-Learn for 2020-01-10
Fitting HMM usign HMM-Learn for 2020-01-13
Fitting HMM usign HMM-Learn for 2020-01-14
Fitting HMM usign HMM-Learn for 2020-01-15
Fitting HMM usign HMM-Learn for 2020-01-16
Fitting HMM usign HMM-Learn for 2020-01-17
Fitting HMM usign HMM-Learn for 2020-01-21
Fitting HMM usign HMM-Learn for 2020-01-22
Fitting HMM usign HMM-Learn for 2020-01-23
Fitting HMM usign HMM-Learn for 2020-01-24
Fitting HMM usign HMM-Learn for 2020-01-27
Fitting HMM usign HMM-Learn for 2020-01-28
Fitting HMM usign HMM-Learn for 2020-01-29
Fitting HMM usign HMM-Learn for 2020-01-30


In [19]:
# Write each feature's HMMLearn parameter table to data/results/hmm_<feature>.csv.
# NOTE(review): the saved output of this cell also shows the table contents,
# which this code does not print - the output looks stale; re-run to refresh.
for feature,df in hmm_df.items():
    print(f"Feature {feature} df :")
    df.to_csv(f'data/results/hmm_{feature}.csv')

Feature spread df :
                 mu1       mu2    sigma1    sigma2       a11       a12  \
2020-01-02  0.036116  0.106534  0.000115  0.008167  0.943314  0.056686   
2020-01-03  0.038206  0.139014  0.000112  0.010156  0.924463  0.075537   
2020-01-06  0.036967  0.121273  0.000089  0.009514  0.938933  0.061067   
2020-01-07  0.034913  0.119909  0.000096  0.008083  0.941334  0.058666   
2020-01-08  0.043897  0.150747  0.000190  0.011675  0.929943  0.070057   
2020-01-09  0.039070  0.115007  0.000085  0.005854  0.903606  0.096394   
2020-01-10  0.040480  0.116082  0.000105  0.005972  0.918102  0.081898   
2020-01-13  0.047738  0.146045  0.000275  0.010029  0.964772  0.035228   
2020-01-14  0.049090  0.206467  0.000210  0.057600  0.953959  0.046041   
2020-01-15  0.053960  0.238875  0.000327  0.082092  0.883546  0.116454   
2020-01-16  0.043975  0.128213  0.000125  0.006779  0.850140  0.149860   
2020-01-17  0.114830  0.036835  0.006030  0.000120  0.694066  0.305934   
2020-01-21  0.1457

### Parameter Estimates for HMM on Spread

In [23]:
# Summary statistics over the fitted days for the PSG spread parameters.
psg_df['spread'].describe()

Unnamed: 0,a11,a12,a21,a22,mu1,si1,mu2,si2
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,0.806272,0.193728,0.133529,0.866471,0.151345,0.085225,0.091133,0.058248
std,0.105667,0.105667,0.098822,0.098822,0.145021,0.059134,0.084328,0.082582
min,0.56927,0.046254,0.028994,0.660056,0.036078,0.010642,0.034876,0.009084
25%,0.729465,0.103503,0.068379,0.825096,0.099377,0.061805,0.039999,0.010491
50%,0.797302,0.202698,0.094486,0.905514,0.122642,0.087175,0.045992,0.01423
75%,0.896497,0.270535,0.174904,0.931621,0.146913,0.100908,0.113236,0.092674
max,0.953746,0.43073,0.339944,0.971006,0.709377,0.257812,0.342463,0.286053


In [24]:
# Summary statistics over the fitted days for the HMMLearn spread parameters.
hmm_df['spread'].describe()

Unnamed: 0,mu1,mu2,sigma1,sigma2,a11,a12,a21,a22
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,0.087988,0.1549,0.004322,0.016183,0.890646,0.109354,0.21705,0.78295
std,0.148929,0.074406,0.014862,0.020204,0.078843,0.078843,0.102974,0.102974
min,0.034913,0.036835,8.5e-05,0.00012,0.673841,0.035228,0.028971,0.568398
25%,0.040046,0.118953,0.000112,0.007077,0.877213,0.060467,0.140751,0.72339
50%,0.045857,0.136922,0.0002,0.009771,0.918721,0.081279,0.241075,0.758925
75%,0.055977,0.175794,0.000343,0.011787,0.939533,0.122787,0.27661,0.859249
max,0.709399,0.342486,0.066525,0.082092,0.964772,0.326159,0.431602,0.971029


### Parameter Estimates for HMM on Bidsize

In [25]:
# Summary statistics over the fitted days for the PSG bid-size parameters.
psg_df['bidsize'].describe()

Unnamed: 0,a11,a12,a21,a22,mu1,si1,mu2,si2
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,0.775628,0.224372,0.057393,0.942607,1.466907,0.370816,1.070006,0.219555
std,0.215686,0.215686,0.041205,0.041205,0.206305,0.169462,0.098283,0.023631
min,0.1439,0.010338,0.001707,0.864498,1.21304,0.149726,0.930758,0.177427
25%,0.643866,0.037887,0.019853,0.910883,1.27958,0.258228,0.960312,0.20053
50%,0.8398,0.1602,0.062197,0.937803,1.4243,0.311933,1.072182,0.221013
75%,0.962113,0.356134,0.089117,0.980147,1.56967,0.441264,1.149388,0.235914
max,0.989662,0.8561,0.135502,0.998293,1.964026,0.801932,1.20665,0.270283


In [26]:
# Summary statistics over the fitted days for the HMMLearn bid-size parameters.
hmm_df['bidsize'].describe()

Unnamed: 0,mu1,mu2,sigma1,sigma2,a11,a12,a21,a22
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,1.195091,1.121738,0.069131,0.066687,0.8067143,0.193286,0.244499,0.7555011
std,0.180027,0.172313,0.034515,0.073404,0.2868913,0.286891,0.344051,0.3440514
min,0.943627,0.929188,4e-06,3e-06,1.031425e-77,0.014781,0.010337,2.004281e-101
25%,1.093409,0.960333,0.045991,0.037625,0.7908668,0.025046,0.040901,0.7946621
50%,1.215788,1.098612,0.06638,0.049945,0.932981,0.067019,0.090018,0.9099817
75%,1.272639,1.230534,0.086884,0.067764,0.9749537,0.209133,0.205338,0.9590991
max,1.698405,1.561602,0.140773,0.354117,0.9852187,1.0,1.0,0.9896627


### Parameter Estimates for HMM on OfferSize

In [27]:
# Summary statistics over the fitted days for the PSG offer-size parameters.
psg_df['offersize'].describe()

Unnamed: 0,a11,a12,a21,a22,mu1,si1,mu2,si2
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,0.799846,0.200154,0.104828,0.895172,1.433176,0.36173,1.103561,0.210331
std,0.266062,0.266062,0.21827,0.21827,0.290408,0.1912,0.108998,0.059809
min,0.0,0.00576,0.000442,0.0,1.174642,0.113905,0.862478,1e-06
25%,0.731759,0.043505,0.006716,0.874183,1.25897,0.269635,1.018361,0.199433
50%,0.939493,0.060507,0.028757,0.971243,1.299874,0.277071,1.089679,0.215657
75%,0.956495,0.268241,0.125817,0.993284,1.526534,0.351048,1.208586,0.23302
max,0.99424,1.0,1.0,0.999558,2.141238,0.866924,1.243101,0.293308


In [28]:
# Summary statistics over the fitted days for the HMMLearn offer-size parameters.
hmm_df['offersize'].describe()

Unnamed: 0,mu1,mu2,sigma1,sigma2,a11,a12,a21,a22
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,1.165297,1.147013,0.055883,0.074144,0.7931783,0.206822,0.284084,0.7159159
std,0.114616,0.137965,0.029966,0.118996,0.2971967,0.297197,0.38705,0.3870501
min,0.973117,0.862708,4e-06,3e-06,3.37831e-96,0.002617,0.006291,8.361729999999999e-185
25%,1.076817,1.047047,0.039338,0.035014,0.7341756,0.02237,0.04351,0.603228
50%,1.183235,1.136611,0.049489,0.050886,0.9075243,0.092476,0.058838,0.9411617
75%,1.263334,1.266643,0.077047,0.07385,0.97763,0.265824,0.396772,0.95649
max,1.308572,1.339824,0.116181,0.563636,0.9973826,1.0,1.0,0.9937095


### Parameter Estimates for HMM on BookImbalance

In [29]:
# Summary statistics over the fitted days for the PSG book-imbalance parameters.
psg_df['bookimbalance'].describe()

Unnamed: 0,a11,a12,a21,a22,mu1,si1,mu2,si2
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,0.859427,0.140573,0.104604,0.895396,1.014332,0.316166,0.673538,0.211919
std,0.084247,0.084247,0.066329,0.066329,0.1476,0.112522,0.110196,0.023327
min,0.639234,0.02358,0.030673,0.761885,0.714782,0.194499,0.503831,0.169518
25%,0.83837,0.097462,0.043665,0.849729,0.966512,0.284053,0.613157,0.197274
50%,0.881739,0.118261,0.087388,0.912612,1.004996,0.297682,0.666259,0.210018
75%,0.902538,0.16163,0.150271,0.956335,1.019162,0.30865,0.709919,0.234427
max,0.97642,0.360766,0.238115,0.969327,1.522145,0.78056,1.050343,0.248889


In [30]:
# Summary statistics over the fitted days for the HMMLearn book-imbalance parameters.
hmm_df['bookimbalance'].describe()

Unnamed: 0,mu1,mu2,sigma1,sigma2,a11,a12,a21,a22
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,0.859241,0.816309,0.089726,0.065681,0.8293533,0.170647,0.116675,0.883325
std,0.236757,0.185198,0.127546,0.021309,0.2085553,0.208555,0.071775,0.071775
min,0.503867,0.557913,4e-06,0.028746,2.8382240000000004e-29,0.023577,0.030672,0.700145
25%,0.684224,0.663985,0.041626,0.045665,0.8289982,0.07442,0.057318,0.846215
50%,0.831913,0.758641,0.066274,0.061163,0.8825577,0.117442,0.11702,0.88298
75%,1.008324,1.00509,0.089308,0.08615,0.9255796,0.171002,0.153785,0.942682
max,1.476736,1.124476,0.61782,0.095574,0.9764234,1.0,0.299855,0.969328
