In [1]:
import numpy as np
import pandas as pd
from hmmlearn.hmm import GaussianHMM
from datetime import date,timedelta


import os
# Raw string: the Windows path contains backslashes that a normal string literal
# would parse as (invalid) escape sequences such as "\A" and "\P".
os.add_dll_directory(r'C:\Aorda\PSG\lib')
import psgpython as psg 
from psg_loader import load_psg


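### Script to iteratively fit an HMM model, one trading day at a time
- The `method` parameter of `fit_hmm` selects the fitting backend: `1` uses PSG, `2` uses hmmlearn.
- All available features (`Bid_Size`, `Offer_Size`, `OB_IB`, `spread`) are prepared for each day; the HMM itself is fitted on `spread`.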

In [4]:

def remove_duplicates(series):
    """Remove identical consecutive observations.

    Every run of equal adjacent values is collapsed to its first element, so
    ``[1, 1, 2, 2, 1]`` becomes ``[1, 2, 1]``. A value that reappears after a
    different value is kept.

    Parameters
    ----------
    series : array-like
        One-dimensional numeric sequence. Lists and tuples are converted to a
        NumPy array; other inputs (e.g. ``ndarray``) are indexed as they are.

    Returns
    -------
    The filtered sequence. Empty input is returned unchanged.

    Notes
    -----
    Equality is decided by ``np.diff(...) != 0``; the difference of two NaNs is
    NaN, which is truthy, so consecutive NaNs are not collapsed.
    """
    if isinstance(series, (list, tuple)):
        series = np.asarray(series)

    # For empty input np.insert would build a length-1 mask that cannot index
    # a length-0 array (IndexError), so short-circuit here.
    if len(series) == 0:
        return series

    # True wherever an element differs from its predecessor; the first element
    # has no predecessor and is always kept.
    changed = np.diff(series).astype(bool)
    keep = np.insert(changed, 0, True)
    return series[keep]


def prep_features(dt):
    """Load one day's grouped features and de-duplicate each column.

    Reads ``data/agg_features/grouped_features_{dt}.csv`` and passes the raw
    values of each feature column through ``remove_duplicates``.

    Parameters
    ----------
    dt : object
        Interpolated into the CSV file name via an f-string.

    Returns
    -------
    dict
        Maps ``'Bid_Size'``, ``'Offer_Size'``, ``'OB_IB'`` and ``'spread'`` to
        their de-duplicated value arrays. Each column is filtered on its own,
        so the arrays may differ in length.
    """
    day_frame = pd.read_csv(f'data/agg_features/grouped_features_{dt}.csv')
    feature_columns = ['Bid_Size', 'Offer_Size', 'OB_IB', 'spread']

    return {
        column: remove_duplicates(day_frame[column].values)
        for column in feature_columns
    }

def extract_params(param_dict, method):
    """Turn the per-day fitted parameters into one DataFrame indexed by day.

    Parameters
    ----------
    param_dict : dict
        Keyed by day.
        For ``method == 1`` each value is a flat sequence of ten numbers, labelled
        ``p1, p2, a11, a12, a21, a22, mu1, si1, mu2, si2`` in that order
        (presumably initial probabilities, transition matrix and per-state
        mean/sigma as returned by PSG; verify against the PSG solution layout).
        For ``method == 2`` each value is a dict with the keys ``'Mean'`` and
        ``'Sigma'``, each holding two numbers (one per state).
    method : int
        ``1`` for the PSG layout, ``2`` for the hmmlearn layout.

    Returns
    -------
    pandas.DataFrame or None
        One row per key of ``param_dict``. For ``method == 2`` the columns are
        ``mu1, mu2, sigma1, sigma2``; an empty ``param_dict`` yields an empty
        frame with those columns. Any other ``method`` returns ``None``.
    """
    if method == 1:
        psg_columns = ['p1', 'p2', 'a11', 'a12', 'a21', 'a22', 'mu1', 'si1', 'mu2', 'si2']
        return pd.DataFrame.from_dict(param_dict, orient='index', columns=psg_columns)

    if method == 2:
        hmm_columns = ['mu1', 'mu2', 'sigma1', 'sigma2']

        # Nothing was fitted (e.g. every day was skipped): there is no 'Mean'
        # column to unpack, so return an empty frame with the expected columns.
        if not param_dict:
            return pd.DataFrame(columns=hmm_columns)

        raw = pd.DataFrame.from_dict(param_dict, orient='index')

        # Each cell of 'Mean' / 'Sigma' holds a two-element sequence; expand
        # them into one column per state.
        means = pd.DataFrame(raw['Mean'].to_list(), columns=hmm_columns[:2])
        covars = pd.DataFrame(raw['Sigma'].to_list(), columns=hmm_columns[2:])

        result = pd.concat([means, covars], axis=1)
        # The expanded frames carry a fresh 0..n-1 index; restore the day keys.
        result.index = raw.index.values
        return result

    # Unknown method: same result as before (None), now stated explicitly.
    return None


def _fit_spread_psg(dt_features):
    """Fit the spread HMM for one day with the PSG solver and return its parameter vector."""
    # The feature vectors are written as text (numpy float format); presumably the
    # PSG problem file imported below reads them from this directory.
    np.savetxt(r'psg_text_hmm/vector_bidsize.txt', dt_features['Bid_Size'])
    np.savetxt(r'psg_text_hmm/vector_offersize.txt', dt_features['Offer_Size'])
    np.savetxt(r'psg_text_hmm/vector_bookimbalance.txt', dt_features['OB_IB'])
    np.savetxt(r'psg_text_hmm/vector_spread.txt', dt_features['spread'])

    psg_spread_prob = psg.psg_importfromtext('./psg_text_hmm/problem_hmm_normal_spread.txt')
    # The imported problem statement is a list of lines; the solver is given it
    # as a single newline-joined string.
    psg_spread_prob['problem_statement'] = '\n'.join(psg_spread_prob['problem_statement'])
    spread_solution = psg.psg_solver(psg_spread_prob)

    # NOTE(review): the optimal point is picked by position (5th entry of the
    # solution dict, 2nd element of it). This depends on the layout of the PSG
    # solution; confirm and prefer a lookup by key if one is available.
    return list(spread_solution.values())[4][1]


def _fit_spread_hmmlearn(dt_features, random_state=None):
    """Fit a two-state GaussianHMM on one day's spread and return its means and covariances."""
    # hmmlearn expects a 2-D array of shape (n_samples, n_features).
    spread = dt_features['spread'].reshape(-1, 1)
    spread_model = GaussianHMM(n_components=2, algorithm='viterbi', covariance_type="spherical",
                               min_covar=1e-4, n_iter=1000, tol=1e-8, random_state=random_state)
    fitted_spread_model = spread_model.fit(spread)
    # NOTE: "Sigma" holds the fitted covars_ values, not their square roots.
    return {"Mean": fitted_spread_model.means_.flatten(),
            "Sigma": fitted_spread_model.covars_.flatten()}


def fit_hmm(method, start=None, n_days=3, random_state=None):
    """Fit a spread HMM for each day in a date window with PSG or hmmlearn.

    Parameters
    ----------
    method : int
        ``1`` fits with the PSG solver, ``2`` fits with hmmlearn's GaussianHMM.
        Any other value prints "Not a valid method" and returns ``None``.
    start : datetime.date, optional
        First calendar day of the window. Defaults to 2020-01-01.
    n_days : int, optional
        Number of consecutive calendar days to try, starting at ``start``.
        Defaults to 3.
    random_state : int or None, optional
        Passed to GaussianHMM (``method == 2`` only) so a fit can be reproduced.
        ``None`` keeps hmmlearn's default, unseeded behaviour.

    Returns
    -------
    pandas.DataFrame or None
        The per-day parameters as laid out by ``extract_params``.

    Notes
    -----
    Days whose feature file is missing are skipped silently. Days whose file
    cannot be read or lacks an expected column are skipped with a printed
    message. Errors raised by the fit itself are not caught.
    """
    if method not in (1, 2):
        print("Not a valid method")
        return

    if start is None:
        start = date(2020, 1, 1)
    days = [start + timedelta(days=i) for i in range(n_days)]

    spread_params = {}
    for dt in days:
        try:
            dt_features = prep_features(dt)
        except FileNotFoundError:
            # No feature file for this date (presumably a non-trading day).
            continue
        except (OSError, KeyError, ValueError) as exc:
            # Unreadable / malformed file or missing column: keep going, but
            # say so instead of hiding the problem.
            print(f"Skipping {dt}: could not prepare features ({exc!r})")
            continue

        if method == 1:
            spread_params[dt] = _fit_spread_psg(dt_features)
        else:
            print(f"Fitting HMM usign HMM-Learn for {dt}")
            spread_params[dt] = _fit_spread_hmmlearn(dt_features, random_state)

    spread_df = extract_params(spread_params, method)
    return spread_df

### PSG


In [5]:
# Fit the spread HMM day by day with the PSG solver (method=1).
psg_spread_df=fit_hmm(method=1)

OK. Problem Imported

Running solver
Reading problem formulation
Asking for data information
Getting data
    100.0% of scenarios is processed
100% of vector_spread was read
Start optimization
Ext.iteration=0  Objective=0.740725099987E+00  Residual=0.000000000000E+00
Ext.iteration=10  Objective=0.740725099987E+00  Residual=0.000000000000E+00
Optimization is stopped
Solution is optimal
Calculating resulting outputs. Writing solution.
Objective: objective = 32086.1760096 [-4.512213776820E+16]
Solver has normally finished. Solution was saved.
Problem: problem_hmm_normal, solution_status = optimal
Timing: data_loading_time = 0.11, preprocessing_time = 9.19, solving_time = 0.80
Variables: optimal_point = point_problem_hmm_normal
Objective: objective = 32086.1760096 [-4.512213776820E+16]
Constraint: sum_of_probabilities_for_states = vector_sum_of_probabilities_for_states
Function: hmm_normal(2,vector_spread) =  3.208617600959E+04
OK. Solver Finished

OK. Problem Imported

Running solver
Read

In [6]:
# Show the PSG result; the fit above is stored in `psg_spread_df`, not `spread_df`.
psg_spread_df

Unnamed: 0,p1,p2,a11,a12,a21,a22,mu1,si1,mu2,si2
2020-01-02,0.0,1.0,0.942808,0.057192,0.258518,0.741482,0.036078,0.010642,0.106166,0.090109
2020-01-03,1.0,0.0,0.763903,0.236097,0.076161,0.923839,0.138587,0.100653,0.038164,0.010488


### HMMLearn

In [6]:
# Fit the spread HMM day by day with hmmlearn's GaussianHMM (method=2).
spread_df=fit_hmm(method=2)

Fitting HMM usign HMM-Learn for 2020-01-02
Fitting HMM usign HMM-Learn for 2020-01-03
Fitting HMM usign HMM-Learn for 2020-01-06
Fitting HMM usign HMM-Learn for 2020-01-07
Fitting HMM usign HMM-Learn for 2020-01-08
Fitting HMM usign HMM-Learn for 2020-01-09
Fitting HMM usign HMM-Learn for 2020-01-10
Fitting HMM usign HMM-Learn for 2020-01-13
Fitting HMM usign HMM-Learn for 2020-01-14
Fitting HMM usign HMM-Learn for 2020-01-15
Fitting HMM usign HMM-Learn for 2020-01-16
Fitting HMM usign HMM-Learn for 2020-01-17
Fitting HMM usign HMM-Learn for 2020-01-21
Fitting HMM usign HMM-Learn for 2020-01-22
Fitting HMM usign HMM-Learn for 2020-01-23
Fitting HMM usign HMM-Learn for 2020-01-24
Fitting HMM usign HMM-Learn for 2020-01-27
Fitting HMM usign HMM-Learn for 2020-01-28
Fitting HMM usign HMM-Learn for 2020-01-29
Fitting HMM usign HMM-Learn for 2020-01-30


In [7]:
# Per-day state means and covariances from the hmmlearn fit.
spread_df

Unnamed: 0,mu1,mu2,sigma1,sigma2
2020-01-02,0.036116,0.106534,0.000115,0.008167
2020-01-03,0.038183,0.138803,0.000111,0.010144
2020-01-06,0.036967,0.121273,8.9e-05,0.009514
2020-01-07,0.119909,0.034913,0.008083,9.6e-05
2020-01-08,0.043897,0.150747,0.00019,0.011675
2020-01-09,0.03907,0.115007,8.5e-05,0.005854
2020-01-10,0.04048,0.116082,0.000105,0.005972
2020-01-13,0.047729,0.146048,0.000275,0.010025
2020-01-14,0.04909,0.206467,0.00021,0.0576
2020-01-15,0.05396,0.238875,0.000327,0.082092


In [8]:
# Summary statistics of the fitted parameters across days.
spread_df.describe()

Unnamed: 0,mu1,mu2,sigma1,sigma2
count,20.0,20.0,20.0,20.0
mean,0.099785,0.143107,0.005205,0.0153
std,0.150853,0.07953,0.014875,0.020693
min,0.036116,0.034913,8.5e-05,9.6e-05
25%,0.040453,0.112889,0.000114,0.005943
50%,0.046043,0.131521,0.0002,0.00977
75%,0.086877,0.152228,0.002036,0.010696
max,0.709344,0.342477,0.066529,0.082092


### Plot of Values across Time


### Examine Stationary Distribution