In [1]:
import numpy as np
import pandas as pd
from hmmlearn.hmm import GaussianHMM
from scipy.stats import t
from datetime import date,timedelta


import os
os.add_dll_directory('C:\Aorda\PSG\lib')
import psgpython as psg 
from psg_loader import load_psg


### Script to iteratively Fit HMM model 
- Flexibility to fit via PSG constrained optimization starting with Baum-Welch or Hmmlearn Viterbi algorithm

In [2]:
def remove_duplicates(series):
    """ Remove identical consecutive observations
    """
    cleaned_series=series[np.insert(np.diff(series).astype(bool), 0, True)]
    dropped_els=len(series)-len(cleaned_series)
    
    return cleaned_series


def prep_features(dt):
    """ Prep features by accessing grouped feature csv 
    """
    grouped_features=pd.read_csv(f'data/agg_features/grouped_features_{dt}.csv')
    bidsize=remove_duplicates(grouped_features['Bid_Size'].values)
    offersize=remove_duplicates(grouped_features['Offer_Size'].values)
    bookimbalance=remove_duplicates(grouped_features['OB_IB'].values)
    spread=remove_duplicates(grouped_features['spread'].values)

    feature_dict=dict(zip(['Bid_Size','Offer_Size','OB_IB','spread'],[bidsize,offersize,bookimbalance,spread]))

    return feature_dict


In [25]:
def compute_pval(row):
    """ Perform two t-test to determine if distributions are significantly different 
    """
    test_stat=(row.loc['mu2']-row.loc['mu1'])/np.sqrt((row.loc['sigma1']**2)+(row.loc['sigma2']**2)/20)
  
    df_num=((row.loc['sigma1']**2)+(row.loc['sigma2']**2)/20)**2
    df_dem=(((row.loc['sigma1']**2)/20)**2)/19+(((row.loc['sigma2']**2)/20)**2)/19
    df=df_num/df_dem
   

    p_val=2*np.round(t.cdf(-abs(test_stat),df),4)
    return p_val

In [3]:
def extract_params(param_dict,method):
    """ Extract Params from optimized model 
    """
    if method==1:
        param_df=pd.DataFrame.from_dict(param_dict,orient='index',columns=['p1','p2','a11','a12', 'a21', 'a22','mu1','si1','mu2','si2'])
        param_df=param_df.rename(columns={'si1':'sigma1','si2':'sigma2'})
        
        p_vals=[]
        # standardizing mu1 < mu2 as arbitrarily labeled, swapping mus and sigmas if needed 
        for x,row in param_df.iterrows():

            p_vals.append(compute_pval(row))

            if row.loc['mu1']<row.loc['mu2']:
                continue
            
            mu1,mu2,sigma1,sigma2=row.loc['mu1'],row.loc['mu2'],row.loc['sigma1'],row.loc['sigma2']
            
            row.loc['mu1'],row.loc['mu2'],row.loc['sigma1'],row.loc['sigma2']=mu2,mu1,sigma2,sigma1

        param_df=param_df.drop(columns=['p1','p2'])
        param_df['p_val']=pd.Series(p_vals,index=param_df.index)
        return param_df

    elif method==2:
        param_df=pd.DataFrame.from_dict(param_dict,orient='index')
        means= pd.DataFrame(param_df['Mean'].to_list(),columns=['mu1','mu2'])
        covar=pd.DataFrame(param_df['Sigma'].to_list(), columns = ['sigma1', 'sigma2'])
        transit=pd.DataFrame(param_df['Transition'].to_list(), columns = ['a11', 'a12','a21','a22'])
        dates=pd.DataFrame(param_df.index,columns=['date'])
        
        new_param_df=pd.DataFrame(pd.concat([means,covar,transit,dates],axis=1))
        new_param_df.index=new_param_df['date'].values
        new_param_df=new_param_df.drop(columns=['date'])

        p_vals=[]

       # standardizing mu1 < mu2 as arbitrarily labeled, swapping mus and sigmas if needed 
        for x,row in new_param_df.iterrows():
            p_vals.append(compute_pval(row))
            if row.loc['mu1']<row.loc['mu2']:
                continue
            mu1,mu2,sigma1,sigma2=row.loc['mu1'],row.loc['mu2'],row.loc['sigma1'],row.loc['sigma2']
            
            row.loc['mu1'],row.loc['mu2'],row.loc['sigma1'],row.loc['sigma2']=mu2,mu1,sigma2,sigma1

        new_param_df=new_param_df[['a11','a12','a21','a22','mu1','sigma1','mu2','sigma2']]
        
        new_param_df['p_val']=pd.Series(p_vals,index=new_param_df.index)
        return new_param_df


def fit_hmm(method):
    """ Fit HMM model with PSG and HMMLearn 
    """
    start=date(2020,1,1)
    days=[start+timedelta(days=i) for i in range(0,30)]

    spread_params={}
    bidsize_params={}
    offersize_params={}
    bookimbalance_params={}
    
    # psg training
    if method==1:
        for dt in days:
            try:
                dt_features=prep_features(dt)
            except:
                continue
            
            # formatted as numpy float 
            np.savetxt(r'psg_text_hmm/vector_bidsize.txt', dt_features['Bid_Size'])
            np.savetxt(r'psg_text_hmm/vector_offersize.txt', dt_features['Offer_Size'])
            np.savetxt(r'psg_text_hmm/vector_bookimbalance.txt', dt_features['OB_IB'])
            np.savetxt(r'psg_text_hmm/vector_spread.txt', dt_features['spread'])


            psg_spread_prob = psg.psg_importfromtext('./psg_text_hmm/problem_hmm_normal_spread.txt')
            psg_spread_prob['problem_statement'] = '\n'.join(psg_spread_prob['problem_statement'])
            spread_solution=psg.psg_solver(psg_spread_prob)
            params=list(spread_solution.values())[4][1]
            spread_params[dt]=params

            psg_bidsize_prob = psg.psg_importfromtext('./psg_text_hmm/problem_hmm_normal_bidsize.txt')
            psg_bidsize_prob['problem_statement'] = '\n'.join(psg_bidsize_prob['problem_statement'])
            bidsize_solution=psg.psg_solver(psg_bidsize_prob)
            params=list(bidsize_solution.values())[4][1]
            bidsize_params[dt]=params

            psg_offersize_prob = psg.psg_importfromtext('./psg_text_hmm/problem_hmm_normal_offersize.txt')
            psg_offersize_prob['problem_statement'] = '\n'.join(psg_offersize_prob['problem_statement'])
            offersize_solution=psg.psg_solver(psg_offersize_prob)
            params=list(offersize_solution.values())[4][1]
            offersize_params[dt]=params

            psg_bookimbalance_prob = psg.psg_importfromtext('./psg_text_hmm/problem_hmm_normal_bookimbalance.txt')
            psg_bookimbalance_prob['problem_statement'] = '\n'.join(psg_bookimbalance_prob['problem_statement'])
            bookimbalance_solution=psg.psg_solver(psg_bookimbalance_prob)
            params=list(bookimbalance_solution.values())[4][1]
            bookimbalance_params[dt]=params
            
    elif method==2:
        for dt in days:
            
            try:
                dt_features=prep_features(dt)
            except:
                continue
            print(f"Fitting HMM usign HMM-Learn for {dt}")
            
            
            spread=dt_features['spread'].reshape(-1, 1)
            spread_model=GaussianHMM(n_components=2,algorithm='viterbi',covariance_type="spherical",min_covar=1e-4, n_iter=1000,tol=1e-8)
            fitted_spread_model=spread_model.fit(spread)
            spread_mu=fitted_spread_model.means_.flatten()
            spread_covar=fitted_spread_model.covars_.flatten()
            spread_transit=fitted_spread_model.transmat_.flatten()
            spread_params[dt]={"Mean":spread_mu, "Sigma":spread_covar,"Transition":spread_transit}

            bidsize=dt_features['Bid_Size'].reshape(-1, 1)
            bidsize_model=GaussianHMM(n_components=2,algorithm='viterbi',covariance_type="spherical",min_covar=1e-4, n_iter=1000,tol=1e-8)
            fitted_bidsize_model=bidsize_model.fit(bidsize)
            bidsize_mu=fitted_bidsize_model.means_.flatten()
            bidsize_covar=fitted_bidsize_model.covars_.flatten()
            bidsize_transit=fitted_bidsize_model.transmat_.flatten()
            bidsize_params[dt]={"Mean":bidsize_mu, "Sigma":bidsize_covar,"Transition":bidsize_transit}

            offersize=dt_features['Offer_Size'].reshape(-1, 1)
            offersize_model=GaussianHMM(n_components=2,algorithm='viterbi',covariance_type="spherical",min_covar=1e-4, n_iter=1000,tol=1e-8)
            fitted_offersize_model=offersize_model.fit(offersize)
            offersize_mu=fitted_offersize_model.means_.flatten()
            offersize_covar=fitted_offersize_model.covars_.flatten()
            offersize_transit=fitted_offersize_model.transmat_.flatten()
            offersize_params[dt]={"Mean":offersize_mu, "Sigma":offersize_covar,"Transition":offersize_transit}

            bookimbalance=dt_features['OB_IB'].reshape(-1, 1)
            bookimbalance_model=GaussianHMM(n_components=2,algorithm='viterbi',covariance_type="spherical",min_covar=1e-4, n_iter=1000,tol=1e-8)
            fitted_bookimbalance_model=bookimbalance_model.fit(bookimbalance)
            bookimbalance_mu=fitted_bookimbalance_model.means_.flatten()
            bookimbalance_covar=fitted_bookimbalance_model.covars_.flatten()
            bookimbalance_transit=fitted_bookimbalance_model.transmat_.flatten()
            bookimbalance_params[dt]={"Mean":bookimbalance_mu, "Sigma":bookimbalance_covar,"Transition":bookimbalance_transit}


    else:
        print("Not a valid method")
        return

    features_labels=["spread","bidsize","offersize","bookimbalance"]
    spread_df=extract_params(spread_params,method)
    bidsize_df=extract_params(bidsize_params,method)
    offersize_df=extract_params(offersize_params,method)
    bookimbalance_df=extract_params(bookimbalance_params,method)

    dict_df=dict(zip(features_labels,[spread_df,bidsize_df,offersize_df,bookimbalance_df]))
    return dict_df

### PSG


In [4]:
psg_df=fit_hmm(method=1)

OK. Problem Imported

Running solver
Reading problem formulation
Asking for data information
Getting data
100% of vector_spread was read
Start optimization
Ext.iteration=0  Objective=0.740725099987E+00  Residual=0.000000000000E+00
Ext.iteration=10  Objective=0.740725099987E+00  Residual=0.000000000000E+00
Optimization is stopped
Solution is optimal
Calculating resulting outputs. Writing solution.
Objective: objective = 32086.1760096 [-4.512213776820E+16]
Solver has normally finished. Solution was saved.
Problem: problem_hmm_normal, solution_status = optimal
Timing: data_loading_time = 0.12, preprocessing_time = 10.24, solving_time = 0.98
Variables: optimal_point = point_problem_hmm_normal
Objective: objective = 32086.1760096 [-4.512213776820E+16]
Constraint: sum_of_probabilities_for_states = vector_sum_of_probabilities_for_states
Function: hmm_normal(2,vector_spread) =  3.208617600959E+04
OK. Solver Finished

OK. Problem Imported

Running solver
Reading problem formulation
Asking for d

In [5]:
for feature,df in psg_df.items():
    print(f"Feature {feature} df saved")
    df.to_csv(f'data/results/psg_{feature}.csv')
    psg_df[feature]=df.rename(columns={'si1':'sigma1','si2':'sigma2'})

Feature spread df saved
Feature bidsize df saved
Feature offersize df saved
Feature bookimbalance df saved


### HMM Model

In [6]:
hmm_df=fit_hmm(method=2)

Fitting HMM usign HMM-Learn for 2020-01-02
Fitting HMM usign HMM-Learn for 2020-01-03
Fitting HMM usign HMM-Learn for 2020-01-06
Fitting HMM usign HMM-Learn for 2020-01-07
Fitting HMM usign HMM-Learn for 2020-01-08
Fitting HMM usign HMM-Learn for 2020-01-09
Fitting HMM usign HMM-Learn for 2020-01-10
Fitting HMM usign HMM-Learn for 2020-01-13
Fitting HMM usign HMM-Learn for 2020-01-14
Fitting HMM usign HMM-Learn for 2020-01-15
Fitting HMM usign HMM-Learn for 2020-01-16
Fitting HMM usign HMM-Learn for 2020-01-17
Fitting HMM usign HMM-Learn for 2020-01-21
Fitting HMM usign HMM-Learn for 2020-01-22
Fitting HMM usign HMM-Learn for 2020-01-23
Fitting HMM usign HMM-Learn for 2020-01-24
Fitting HMM usign HMM-Learn for 2020-01-27
Fitting HMM usign HMM-Learn for 2020-01-28
Fitting HMM usign HMM-Learn for 2020-01-29
Fitting HMM usign HMM-Learn for 2020-01-30


In [7]:
for feature,df in hmm_df.items():
    print(f"Feature {feature} df saved")
    df.to_csv(f'data/results/hmm_{feature}.csv')

Feature spread df saved
Feature bidsize df saved
Feature offersize df saved
Feature bookimbalance df saved


### Parameter Estimates for HMM on Spread

In [8]:
psg_df['spread'].describe()

Unnamed: 0,a11,a12,a21,a22,mu1,sigma1,mu2,sigma2,p_val
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,0.806272,0.193728,0.133529,0.866471,0.060623,0.020881,0.181856,0.122592,0.22218
std,0.105667,0.105667,0.098822,0.098822,0.067114,0.034069,0.133457,0.063987,0.139621
min,0.56927,0.046254,0.028994,0.660056,0.034876,0.009084,0.106166,0.076417,0.0002
25%,0.729465,0.103503,0.068379,0.825096,0.03881,0.010491,0.12053,0.088471,0.122
50%,0.797302,0.202698,0.094486,0.905514,0.043875,0.011771,0.141912,0.100189,0.3122
75%,0.896497,0.270535,0.174904,0.931621,0.05029,0.015209,0.17549,0.108457,0.32275
max,0.953746,0.43073,0.339944,0.971006,0.342463,0.164497,0.709377,0.286053,0.3888


In [9]:
hmm_df['spread'].describe()

Unnamed: 0,a11,a12,a21,a22,mu1,sigma1,mu2,sigma2,p_val
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,0.856072,0.143928,0.182396,0.817604,0.060675,0.001542,0.18223,0.018964,0.00032
std,0.090867,0.090867,0.117822,0.117822,0.067109,0.006011,0.133382,0.022522,0.001431
min,0.67581,0.035228,0.028971,0.568398,0.034913,8.5e-05,0.106534,0.005854,0.0
25%,0.769243,0.075166,0.07938,0.72339,0.038854,0.000112,0.120932,0.007856,0.0
50%,0.896129,0.103871,0.155656,0.844344,0.043936,0.000141,0.14239,0.010063,0.0
75%,0.924834,0.230757,0.27661,0.92062,0.050359,0.000234,0.175794,0.011787,0.0
max,0.964772,0.32419,0.431602,0.971029,0.342482,0.027074,0.709367,0.082092,0.0064


In [28]:
psg_mu1,psg_mu2,psg_sigma1,psg_sigma2=psg_df['spread'].describe().loc['mean'][['mu1','mu2','sigma1','sigma2']]
hmm_mu1,hmm_mu2,hmm_sigma1,hmm_sigma2=hmm_df['spread'].describe().loc['mean'][['mu1','mu2','sigma1','sigma2']]

row1=pd.Series([psg_mu1,psg_sigma1,hmm_mu1,hmm_sigma1],index=['mu1','mu2','sigma1','sigma2'])
p_val1=compute_pval(row1)
row2=pd.Series([psg_mu2,psg_sigma2,hmm_mu2,hmm_sigma2],index=['mu1','mu2','sigma1','sigma2'])
p_val2=compute_pval(row2)

print(f"p-val for two sample t-test on equivalent population mean for normal distribution 1 is {p_val1}")
print(f"p-val for two sample t-test on equivalent population mean for normal distribution 2 is {p_val2}")

p-val for two sample t-test on equivalent population mean for normal distribution 1 is 0.5124
p-val for two sample t-test on equivalent population mean for normal distribution 2 is 0.745


### Parameter Estimates for HMM on Bidsize

In [12]:
psg_df['bidsize'].describe()

Unnamed: 0,a11,a12,a21,a22,mu1,sigma1,mu2,sigma2,p_val
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,0.775628,0.224372,0.057393,0.942607,1.070006,0.219555,1.466907,0.370816,0.28364
std,0.215686,0.215686,0.041205,0.041205,0.098283,0.023631,0.206305,0.169462,0.135481
min,0.1439,0.010338,0.001707,0.864498,0.930758,0.177427,1.21304,0.149726,0.0108
25%,0.643866,0.037887,0.019853,0.910883,0.960312,0.20053,1.27958,0.258228,0.2366
50%,0.8398,0.1602,0.062197,0.937803,1.072182,0.221013,1.4243,0.311933,0.2991
75%,0.962113,0.356134,0.089117,0.980147,1.149388,0.235914,1.56967,0.441264,0.3678
max,0.989662,0.8561,0.135502,0.998293,1.20665,0.270283,1.964026,0.801932,0.5344


In [13]:
hmm_df['bidsize'].describe()

Unnamed: 0,a11,a12,a21,a22,mu1,sigma1,mu2,sigma2,p_val
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,0.7863504,0.21365,0.175425,0.824575,1.030349,0.044345,1.329722,0.136273,0.12761
std,0.3107036,0.310704,0.264839,0.264839,0.096567,0.017696,0.178106,0.151944,0.309779
min,7.530599e-78,0.014511,0.002972,0.052203,0.930784,4e-06,1.202485,0.045376,0.0
25%,0.8589767,0.063792,0.02504,0.873013,0.946616,0.036621,1.228048,0.063986,0.0
50%,0.9111407,0.088859,0.0832,0.9168,0.988397,0.040154,1.268553,0.077598,0.0
75%,0.9362083,0.141023,0.126987,0.97496,1.083016,0.051964,1.368091,0.105676,0.0031
max,0.985489,1.0,0.947797,0.997028,1.218123,0.08606,1.964041,0.643222,0.9992


In [29]:
psg_mu1,psg_mu2,psg_sigma1,psg_sigma2=psg_df['bidsize'].describe().loc['mean'][['mu1','mu2','sigma1','sigma2']]
hmm_mu1,hmm_mu2,hmm_sigma1,hmm_sigma2=hmm_df['bidsize'].describe().loc['mean'][['mu1','mu2','sigma1','sigma2']]

row1=pd.Series([psg_mu1,psg_sigma1,hmm_mu1,hmm_sigma1],index=['mu1','mu2','sigma1','sigma2'])
p_val1=compute_pval(row1)
row2=pd.Series([psg_mu2,psg_sigma2,hmm_mu2,hmm_sigma2],index=['mu1','mu2','sigma1','sigma2'])
p_val2=compute_pval(row2)

print(f"p-val for two sample t-test on equivalent population mean for normal distribution 1 is {p_val1}")
print(f"p-val for two sample t-test on equivalent population mean for normal distribution 2 is {p_val2}")

p-val for two sample t-test on equivalent population mean for normal distribution 1 is 0.4092
p-val for two sample t-test on equivalent population mean for normal distribution 2 is 0.41


### Parameter Estimates for HMM on OfferSize

In [16]:
psg_df['offersize'].describe()

Unnamed: 0,a11,a12,a21,a22,mu1,sigma1,mu2,sigma2,p_val
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,0.799846,0.200154,0.104828,0.895172,1.10159,0.215371,1.435147,0.35669,0.42005
std,0.266062,0.266062,0.21827,0.21827,0.107237,0.060179,0.288689,0.195105,0.206082
min,0.0,0.00576,0.000442,0.0,0.862478,1e-06,1.18328,0.113905,0.0
25%,0.731759,0.043505,0.006716,0.874183,1.018361,0.202605,1.25897,0.269635,0.31355
50%,0.939493,0.060507,0.028757,0.971243,1.089679,0.218045,1.299874,0.277071,0.4084
75%,0.956495,0.268241,0.125817,0.993284,1.192509,0.241782,1.526534,0.351048,0.4548
max,0.99424,1.0,1.0,0.999558,1.243101,0.293308,2.141238,0.866924,0.884


In [17]:
hmm_df['offersize'].describe()

Unnamed: 0,a11,a12,a21,a22,mu1,sigma1,mu2,sigma2,p_val
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,0.902166,0.097834,0.189412,0.8105883,1.049033,0.042263,1.29044,0.09605,0.0285
std,0.134528,0.134528,0.302446,0.3024459,0.084946,0.018781,0.110227,0.111703,0.072304
min,0.483761,0.007416,0.00599,1.5670929999999998e-78,0.862487,4e-06,1.183283,0.027779,0.0
25%,0.885387,0.0145,0.044364,0.8686207,1.012534,0.038011,1.250299,0.066747,0.0
50%,0.936089,0.063911,0.066347,0.9336529,1.025127,0.042785,1.268117,0.073587,0.0007
75%,0.9855,0.114613,0.131379,0.9556364,1.085213,0.048534,1.29703,0.079215,0.00445
max,0.992584,0.516239,1.0,0.99401,1.241438,0.081422,1.731837,0.563636,0.2524


In [30]:
psg_mu1,psg_mu2,psg_sigma1,psg_sigma2=psg_df['offersize'].describe().loc['mean'][['mu1','mu2','sigma1','sigma2']]
hmm_mu1,hmm_mu2,hmm_sigma1,hmm_sigma2=hmm_df['offersize'].describe().loc['mean'][['mu1','mu2','sigma1','sigma2']]

row1=pd.Series([psg_mu1,psg_sigma1,hmm_mu1,hmm_sigma1],index=['mu1','mu2','sigma1','sigma2'])
p_val1=compute_pval(row1)
row2=pd.Series([psg_mu2,psg_sigma2,hmm_mu2,hmm_sigma2],index=['mu1','mu2','sigma1','sigma2'])
p_val2=compute_pval(row2)

print(f"p-val for two sample t-test on equivalent population mean for normal distribution 1 is {p_val1}")
print(f"p-val for two sample t-test on equivalent population mean for normal distribution 2 is {p_val2}")

p-val for two sample t-test on equivalent population mean for normal distribution 1 is 0.3982
p-val for two sample t-test on equivalent population mean for normal distribution 2 is 0.4034


### Parameter Estimes for HMM on BookImbalance

In [20]:
psg_df['bookimbalance'].describe()

Unnamed: 0,a11,a12,a21,a22,mu1,sigma1,mu2,sigma2,p_val
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,0.859427,0.140573,0.104604,0.895396,0.65676,0.2092,1.03111,0.318886,0.24524
std,0.084247,0.084247,0.066329,0.066329,0.066811,0.021918,0.12975,0.110057,0.083815
min,0.639234,0.02358,0.030673,0.761885,0.503831,0.169518,0.893228,0.248889,0.0974
25%,0.83837,0.097462,0.043665,0.849729,0.613157,0.195624,0.975975,0.284053,0.19775
50%,0.881739,0.118261,0.087388,0.912612,0.666259,0.208292,1.008497,0.297682,0.22
75%,0.902538,0.16163,0.150271,0.956335,0.709919,0.221199,1.030171,0.30865,0.2953
max,0.97642,0.360766,0.238115,0.969327,0.757812,0.245684,1.522145,0.78056,0.4784


In [21]:
hmm_df['bookimbalance'].describe()

Unnamed: 0,a11,a12,a21,a22,mu1,sigma1,mu2,sigma2,p_val
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,0.852053,0.147947,0.177779,0.822221,0.668791,0.047311,1.023075,0.114375,0.05003
std,0.198079,0.198079,0.195102,0.195102,0.073902,0.016683,0.137647,0.117038,0.223459
min,0.05204,0.023578,0.031643,0.082952,0.503867,0.028746,0.822523,0.061963,0.0
25%,0.847529,0.058891,0.086635,0.828997,0.627313,0.038272,0.966516,0.080691,0.0
50%,0.901873,0.098127,0.121242,0.878758,0.681109,0.043393,1.008494,0.08985,0.0
75%,0.941109,0.152471,0.171003,0.913365,0.713346,0.054974,1.030172,0.095753,0.0
max,0.976422,0.94796,0.917048,0.968357,0.822454,0.106469,1.522334,0.609441,0.9994


In [31]:
psg_mu1,psg_mu2,psg_sigma1,psg_sigma2=psg_df['bookimbalance'].describe().loc['mean'][['mu1','mu2','sigma1','sigma2']]
hmm_mu1,hmm_mu2,hmm_sigma1,hmm_sigma2=hmm_df['bookimbalance'].describe().loc['mean'][['mu1','mu2','sigma1','sigma2']]

row1=pd.Series([psg_mu1,psg_sigma1,hmm_mu1,hmm_sigma1],index=['mu1','mu2','sigma1','sigma2'])
p_val1=compute_pval(row1)
row2=pd.Series([psg_mu2,psg_sigma2,hmm_mu2,hmm_sigma2],index=['mu1','mu2','sigma1','sigma2'])
p_val2=compute_pval(row2)

print(f"p-val for two sample t-test on equivalent population mean for normal distribution 1 is {p_val1}")
print(f"p-val for two sample t-test on equivalent population mean for normal distribution 2 is {p_val2}")

p-val for two sample t-test on equivalent population mean for normal distribution 1 is 0.5034
p-val for two sample t-test on equivalent population mean for normal distribution 2 is 0.4864
