### Libraries import

In [1]:
# Import libraries
import numpy as np
import pandas as pd
from tqdm import tqdm
import functions as fun
import plotly.io as pio
import importlib as imp
import statsmodels.api as sm
import plotly.graph_objects as go
import sklearn.model_selection as modsel

# Set additional settings for warnings and templates
import warnings
warnings.filterwarnings("ignore")
pio.templates.default = "plotly_dark"
pd.set_option('display.max_columns', None)

In [21]:
# Reload the local `functions` module (imported above as `fun`) so that edits made to it
# after the initial import are picked up without restarting the kernel
imp.reload(fun)



### Feature generation

In [22]:
# Read dataset and define columns for feature generation
data = pd.read_parquet('Data/dataset.parquet').drop(columns = 'Variance')
group_keys = ['Ticker', 'Index']
indices = data.groupby(group_keys).size().index.values
cols = ['Hurst', 'CorrDim', 'Lyapunov', 
        'Skewness', 'Kurtosis', 'PSD', 'ACF_1',
        'WL_C1', 'WL_C2', 'WL_C3']

# Set lag for dynamics and short variance calculation
lag_model = [8]

# Calculate dynamics and short variance
# The original idea about the variance was born from the behaviour of the largest Lyapunov exponent
# before the critical transition point: it mostly didn't move in nominal values,
# but its variance in some cases decreased significantly
# NOTE(review): the relative change divides by the lagged value, so it is unstable when that value
# is close to zero (e.g. CorrDim of order 1e-15) - confirm this is acceptable for the models
frames = []
for _, data_ind in tqdm(data.groupby(group_keys), total = len(indices)):
    # Work on an explicit copy: adding columns to a slice of `data` is a chained assignment
    data_ind = data_ind.copy()
    for col in cols:
        series = data_ind[col]
        for lag_m in lag_model:
            data_ind[f'{col}_{lag_m}_dyn'] = series / series.shift(lag_m) - 1
            data_ind[f'{col}_{lag_m}_Variance'] = series.rolling(lag_m).var()
    # The first `lag_m` rows of every (Ticker, Index) series have no lagged value and are dropped here
    frames.append(data_ind.dropna())

# Concatenate once instead of growing the frame inside the loop
data_logdyn = pd.concat(frames) if frames else pd.DataFrame()

# Reset index to get rid of dates and save final dataset
data_logdyn = data_logdyn.reset_index(drop = True)
data_logdyn = data_logdyn[data_logdyn['Distance'] > 0]
data_logdyn.to_parquet('Data/final_dataset.parquet')
data_logdyn

100%|██████████| 548/548 [00:17<00:00, 32.06it/s]


Unnamed: 0,Volume,MA100,MV100,Rise,Distance,Index,Ticker,Hurst,CorrDim,Lyapunov,Skewness,Kurtosis,PSD,ACF_1,WL_C1,WL_C2,WL_C3,Hurst_8_dyn,Hurst_8_Variance,CorrDim_8_dyn,CorrDim_8_Variance,Lyapunov_8_dyn,Lyapunov_8_Variance,Skewness_8_dyn,Skewness_8_Variance,Kurtosis_8_dyn,Kurtosis_8_Variance,PSD_8_dyn,PSD_8_Variance,ACF_1_8_dyn,ACF_1_8_Variance,WL_C1_8_dyn,WL_C1_8_Variance,WL_C2_8_dyn,WL_C2_8_Variance,WL_C3_8_dyn,WL_C3_8_Variance
0,23952.0,31070.20,7.130232e+08,False,291,2643,AAN,0.651904,-2.815337e-15,0.049050,4.481086,26.699359,-0.609296,0.374819,0.465914,-0.103608,-0.124637,-0.050382,0.002605,-0.391654,1.096794e-29,0.145294,0.000312,0.030011,0.002971,0.082711,0.754393,0.037030,0.000049,0.019152,0.000045,-0.045022,0.003071,0.285858,0.000373,-0.167933,0.001749
1,159410.0,32093.49,8.715063e+08,False,290,2643,AAN,0.662030,-9.789577e-17,0.048384,4.402063,25.787151,-0.612294,0.367293,0.418223,-0.130413,-0.086043,0.050584,0.002415,224.435763,1.094419e-29,0.111672,0.000318,0.012617,0.002558,0.046669,0.643472,0.038410,0.000053,-0.000925,0.000044,-0.009408,0.003095,0.559503,0.000450,-0.162758,0.001908
2,74965.0,32664.47,8.876980e+08,False,289,2643,AAN,0.690370,-2.362195e-15,0.017922,4.390049,25.683490,-0.614583,0.371478,0.475433,-0.103349,-0.175004,0.204455,0.000927,-1.947039,9.171144e-30,-0.676723,0.000350,0.011164,0.001981,0.044073,0.473688,0.038391,0.000054,0.009061,0.000048,-0.046778,0.002778,0.087733,0.000436,-0.030778,0.001826
3,57275.0,32998.33,8.929255e+08,False,288,2643,AAN,0.693400,-2.719365e-15,0.019476,4.389790,25.681349,-0.618153,0.370974,0.380598,-0.146165,-0.076268,0.038265,0.000894,-0.439684,7.705216e-30,-0.652200,0.000322,0.011460,0.001289,0.044543,0.257182,0.040634,0.000055,0.012792,0.000050,0.045300,0.002463,0.182044,0.000618,-0.023034,0.001851
4,36764.0,32491.61,8.628754e+08,False,287,2643,AAN,0.670705,2.596383e-15,0.050424,4.420547,25.969972,-0.621126,0.373620,0.466517,-0.100117,-0.133767,-0.013487,0.000923,-1.533414,6.951390e-30,-0.057433,0.000305,0.009323,0.001066,0.038771,0.127176,0.040629,0.000055,0.063141,0.000010,-0.045662,0.002215,0.182232,0.000544,-0.128684,0.001742
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160010,103982.0,107127.65,4.066301e+09,False,5,1541,ZWS,0.762291,-3.139746e-15,0.002439,3.228418,15.896758,-0.752358,0.577192,0.573424,-0.154147,-0.074996,-0.010378,0.000124,-1.974779,-3.552214e-18,0.288016,0.000002,-0.003889,0.000036,-0.003274,0.001379,-0.021714,0.000026,-0.008208,0.000002,-0.019764,0.001104,0.024043,0.000198,0.047756,0.000524
160011,160370.0,108231.04,4.060776e+09,False,4,1541,ZWS,0.793100,-4.579490e-15,0.001399,3.229276,15.914075,-0.749824,0.576721,0.555407,-0.133274,-0.058564,0.032792,0.000163,-2.177920,-3.552214e-18,-0.411605,0.000003,-0.002980,0.000040,-0.002118,0.001451,-0.022367,0.000028,-0.008702,0.000003,-0.004787,0.001098,-0.038660,0.000224,0.090447,0.000535
160012,1031511.0,117603.93,1.258062e+10,False,3,1541,ZWS,0.767982,-4.548268e-15,0.006788,3.364231,16.524853,-0.747284,0.526691,0.563935,-0.163990,-0.059153,-0.028708,0.000136,-1.868521,-3.552214e-18,0.820707,0.000005,0.037579,0.002156,0.034430,0.045315,-0.023350,0.000031,-0.093385,0.000344,0.008998,0.001116,-0.023602,0.000208,0.327306,0.000546
160013,382418.0,120666.56,1.326214e+10,False,2,1541,ZWS,0.750742,2.858428e-15,0.006570,3.333177,16.262226,-0.744650,0.546938,0.506274,-0.149322,-0.015747,-0.014877,0.000191,-0.462384,-3.552214e-18,7.506477,0.000006,0.030473,0.002933,0.020689,0.052069,-0.024309,0.000034,-0.056829,0.000410,0.021562,0.000969,0.045151,0.000197,0.018425,0.000543


### Modelling with all variables

In the cell below we iterate over three lists of parameters:
- horizons - how many hours before the transition are considered close enough to belong to the prediction phase
- shares - the share of positive observations in the whole modelling dataset - this parameter is important because in the original dataset the share of positives for some of the horizons was too small, so we decided to decrease the size of the negative dataset by sampling it randomly
- states - in order to avoid lucky random choices in the negative sampling, we use a list of different random states and average the results

In [23]:
# Read dataset
data_logdyn = pd.read_parquet('Data/final_dataset.parquet')

# Choose binary target and other parameters
target = 'Flag'
horizons = list(range(1, 9))
shares = np.linspace(0.05, 0.2, 4)
states = list(range(0, 10000, 500))

# Iterate over the chosen parameters and optimize classification models, then save all the results to the dataframe
res = fun.model(data_logdyn, target, horizons, shares, states)

# OHE-like transformation of the variables' lists: one column per candidate variable (plus 'const'),
# NaN where the variable is absent from the mapping stored in 'Coeffs'.
# The table is built in a single call and indexed by `res.index`, so the join below stays aligned
# with `res` whatever its index is, and the coefficient columns get a numeric dtype
coeffs_cols = data_logdyn.columns.drop(['Volume', 'MA100', 'MV100', 'Rise', 'Distance', 'Index', 'Ticker'])
res_coeffs = pd.DataFrame(
    [dict(row) for row in res['Coeffs']],
    index = res.index,
).reindex(columns = list(coeffs_cols) + ['const'])
res = res.drop(columns = ['Coeffs']).join(res_coeffs)
res.to_parquet('Data/params.parquet')

# Create pivot based on the horizon and 1 share parameters (averaging over the random states)
groups = ['Horizon', '1 Share', '1 Share real']
drops = ['State']
res_means = res.groupby(groups)[res.columns.drop(groups + drops)].mean()
res_means.to_parquet('Data/params_mean.parquet')
res_means

100%|██████████| 8/8 [17:04<00:00, 128.05s/it]


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Train size,Test size,Train AUC,Test AUC,Train KS-test p-value,Test KS-test p-value,Train F1-score,Test F1-score,Train precision,Test precision,Train recall,Test recall,Hurst,CorrDim,Lyapunov,Skewness,Kurtosis,PSD,ACF_1,WL_C1,WL_C2,WL_C3,Hurst_8_dyn,Hurst_8_Variance,CorrDim_8_dyn,CorrDim_8_Variance,Lyapunov_8_dyn,Lyapunov_8_Variance,Skewness_8_dyn,Skewness_8_Variance,Kurtosis_8_dyn,Kurtosis_8_Variance,PSD_8_dyn,PSD_8_Variance,ACF_1_8_dyn,ACF_1_8_Variance,WL_C1_8_dyn,WL_C1_8_Variance,WL_C2_8_dyn,WL_C2_8_Variance,WL_C3_8_dyn,WL_C3_8_Variance,const
Horizon,1 Share,1 Share real,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1
1,0.05,0.05016,8740.0,2185.0,0.91705,0.9028,0.0,0.0,0.57445,0.59895,0.44475,0.48455,0.8156,0.8005,,1.607584,6.267405,0.806011,-0.041219,0.668487,1.083096,-0.689425,-0.954452,2.429622,-0.766993,-48.750093,8.734625e-16,-2.48641,0.001889,-388.416154,2.28238,-0.431408,-0.134771,0.0007,-1.446676,719.281892,1.639095,273.559374,-1.260763,-44.110917,,-37.053144,,-34.409319,-3.905914
1,0.1,0.100293,4371.0,1093.0,0.91705,0.9046,0.0,0.0,0.7236,0.70725,0.65555,0.64685,0.81155,0.7948,,0.932452,6.184795,0.832796,-0.04373,0.743014,1.263808,-0.924908,-1.114259,2.349438,,-54.762036,1.683301e-15,-3.644923,0.004566,-529.973318,2.551459,-0.627291,-0.149984,0.001167,-1.719918,864.886362,1.81899,356.849388,-1.508813,-54.288192,,-48.962652,,,-3.571782
1,0.15,0.150425,2914.0,729.0,0.91605,0.9133,0.0,0.0,0.7749,0.78225,0.7435,0.7621,0.81005,0.8092,,-1.840861,7.525784,0.865403,-0.04647,0.677284,1.32092,-1.032421,-1.198083,2.536896,,-56.920876,2.936649e-15,-5.261013,,,2.527806,-0.481449,-0.162402,0.001077,-1.874719,1089.417724,1.847678,408.925355,-1.622408,,,-48.844835,,,-3.367581
1,0.2,0.200512,2186.0,547.0,0.91795,0.9122,0.0,0.0,0.81155,0.80115,0.8135,0.78915,0.811,0.81845,-1.362815,1.186848,8.706753,0.856623,-0.046887,0.637647,1.399337,-1.145474,,2.598529,,-63.904992,4.739529e-15,-3.872457,0.00397,-503.927029,2.651688,-0.636803,-0.170072,0.001722,-2.211436,1102.293919,1.820314,494.315938,-1.778574,-50.472005,,-64.909306,,,-3.050465
2,0.05,0.050328,17421.0,4356.0,0.85525,0.8508,0.0,0.0,0.50195,0.5314,0.3958,0.43455,0.69495,0.69735,,,5.949856,0.576943,-0.029305,0.56574,0.780231,-0.533009,-0.75461,1.529387,,-32.037711,,-1.526988,0.000295,-262.077567,2.252509,-0.245854,-0.109087,0.000323,-0.58015,538.538131,1.677715,284.707936,-0.836844,,,-26.606743,,,-3.132078
2,0.1,0.100615,8714.0,2179.0,0.85725,0.8522,0.0,0.0,0.63565,0.6499,0.5833,0.6182,0.7027,0.6963,,2.064734,6.222017,0.607224,-0.03171,0.626832,0.881601,-0.659062,-0.806744,1.516031,,-34.863703,1.090834e-15,-6.85374,0.000381,-273.87235,2.423935,-0.502257,-0.120826,0.000935,-1.278146,681.664206,1.757455,376.414395,-0.999836,,,-32.822317,,-19.320801,-2.799415
2,0.15,0.150881,5811.0,1453.0,0.8579,0.8502,0.0,0.0,0.70105,0.69735,0.70255,0.69835,0.70215,0.70395,,1.706606,6.015912,0.599213,-0.031346,0.61958,0.88754,-0.728862,-0.858354,1.5459,,-37.872832,1.243321e-15,-3.121328,,-331.085645,2.504764,-0.620129,-0.136071,0.001236,-1.136778,739.938617,1.940004,446.850271,-1.065814,-30.389706,,-35.36893,,,-2.564423
2,0.2,0.201101,4360.0,1090.0,0.86165,0.84565,0.0,0.0,0.739,0.7246,0.7811,0.7724,0.7035,0.6861,,2.366407,6.702149,0.585349,-0.030286,0.666391,0.978562,-0.796565,-0.927206,1.4144,-0.563297,-38.679182,3.823556e-15,-5.297013,,-339.202137,2.503397,-0.544827,-0.130046,0.001316,-1.372624,820.220546,1.919745,501.212278,-1.090282,,,-30.653011,,,-2.339029
3,0.05,0.050495,26046.0,6512.0,0.8069,0.80015,0.0,0.0,0.4785,0.4732,0.401,0.40095,0.5991,0.594,,0.909015,4.71659,0.451179,-0.022806,0.476404,0.579283,-0.444694,-0.591065,1.407781,,-26.337869,,-1.815282,,-212.351931,2.379674,-0.185205,-0.10584,-0.000248,-0.492387,479.679233,1.556019,285.88537,-0.915205,-21.156899,,-19.565594,,,-2.737445
3,0.1,0.100933,13030.0,3258.0,0.8097,0.80575,0.0,0.0,0.58905,0.58295,0.5768,0.56215,0.60485,0.61445,,1.115722,4.418842,0.453154,-0.023059,0.508481,0.619484,-0.464519,-0.693387,1.376691,,-27.267071,,-4.294936,,-236.998279,2.526585,-0.382029,-0.11086,0.000433,-0.829317,565.441235,1.53422,375.499152,-1.034082,-24.16733,,-22.147374,0.008278,,-2.391228


In [24]:
# Get mean metrics for all of the columns to understand what variables are actually used in the final models
# Only the rows of the pivot with horizons 4-8 (first index level) are averaged
horizon_mask = res_means.index.get_level_values(0).isin(range(4, 9))
res_means[horizon_mask].mean().round(2).to_frame('Average coeff')

Unnamed: 0,Average coeff
Train size,26860.75
Test size,6715.75
Train AUC,0.72
Test AUC,0.71
Train KS-test p-value,0.0
Test KS-test p-value,0.0
Train F1-score,0.42
Test F1-score,0.42
Train precision,0.42
Test precision,0.42


In [25]:
# Count a share of appearances: for every (horizon, share) group, the fraction of runs
# in which a variable has a non-missing coefficient
res = pd.read_parquet('Data/params.parquet')
res_4_9 = res[res['Horizon'].isin(range(4, 9))]
groups = ['Horizon', '1 Share', '1 Share real']
drops = ['State']
runs_by_group = res_4_9.groupby(groups)
# Divide by the actual number of runs in each group instead of a hard-coded 20,
# so the shares stay correct if the list of random states changes
runs_by_group[list(coeffs_cols) + ['const']].count().div(runs_by_group.size(), axis = 0).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Hurst,CorrDim,Lyapunov,Skewness,Kurtosis,PSD,ACF_1,WL_C1,WL_C2,WL_C3,Hurst_8_dyn,Hurst_8_Variance,CorrDim_8_dyn,CorrDim_8_Variance,Lyapunov_8_dyn,Lyapunov_8_Variance,Skewness_8_dyn,Skewness_8_Variance,Kurtosis_8_dyn,Kurtosis_8_Variance,PSD_8_dyn,PSD_8_Variance,ACF_1_8_dyn,ACF_1_8_Variance,WL_C1_8_dyn,WL_C1_8_Variance,WL_C2_8_dyn,WL_C2_8_Variance,WL_C3_8_dyn,WL_C3_8_Variance,const
Horizon,1 Share,1 Share real,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
4,0.05,0.050661,0.0,0.05,1.0,1.0,1.0,1.0,1.0,0.65,1.0,1.0,0.0,1.0,0.0,0.55,0.0,1.0,1.0,0.75,1.0,0.3,0.5,1.0,1.0,1.0,1.0,0.25,0.0,0.1,0.0,0.05,1.0
4,0.1,0.101252,0.0,0.2,1.0,1.0,1.0,1.0,0.85,0.6,0.95,1.0,0.0,0.95,0.0,0.55,0.0,1.0,1.0,0.9,1.0,0.45,0.6,1.0,1.0,1.0,1.0,0.2,0.0,0.05,0.0,0.0,1.0
4,0.15,0.151769,0.0,0.05,0.95,1.0,1.0,1.0,0.85,0.8,0.8,1.0,0.1,1.0,0.05,0.3,0.0,0.85,1.0,0.9,0.95,0.5,0.55,1.0,1.0,1.0,1.0,0.1,0.0,0.05,0.0,0.0,1.0
4,0.2,0.202214,0.0,0.2,0.8,1.0,1.0,1.0,0.6,0.65,0.75,1.0,0.1,0.95,0.0,0.45,0.0,0.8,1.0,0.65,0.85,0.55,0.75,1.0,1.0,1.0,1.0,0.0,0.0,0.05,0.0,0.0,1.0
5,0.05,0.050829,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.6,1.0,1.0,0.2,0.95,0.0,0.35,0.0,1.0,1.0,0.55,1.0,0.65,0.6,1.0,1.0,1.0,1.0,0.35,0.0,0.2,0.0,0.0,1.0
5,0.1,0.101568,0.0,0.05,1.0,1.0,1.0,1.0,0.95,0.75,1.0,1.0,0.05,1.0,0.0,0.6,0.0,1.0,1.0,0.55,1.0,0.65,0.75,1.0,1.0,1.0,1.0,0.2,0.0,0.1,0.0,0.0,1.0
5,0.15,0.152222,0.0,0.05,0.95,1.0,1.0,1.0,0.8,0.65,0.9,1.0,0.05,0.9,0.05,0.5,0.0,0.95,1.0,0.6,1.0,0.55,0.9,1.0,1.0,1.0,1.0,0.35,0.0,0.1,0.0,0.05,1.0
5,0.2,0.202783,0.05,0.1,0.95,1.0,1.0,1.0,0.85,0.75,0.9,1.0,0.15,0.9,0.05,0.4,0.0,0.95,1.0,0.65,0.95,0.65,0.95,1.0,1.0,1.0,1.0,0.25,0.0,0.0,0.0,0.05,1.0
6,0.05,0.050999,0.05,0.0,1.0,1.0,1.0,1.0,1.0,0.75,1.0,1.0,0.55,0.9,0.0,0.5,0.0,1.0,1.0,0.5,1.0,0.6,0.4,1.0,1.0,1.0,1.0,0.45,0.0,0.1,0.0,0.0,1.0
6,0.1,0.10189,0.0,0.0,0.95,1.0,1.0,1.0,0.85,0.7,1.0,1.0,0.45,0.95,0.0,0.55,0.0,1.0,1.0,0.45,1.0,0.65,0.55,1.0,1.0,1.0,1.0,0.4,0.05,0.0,0.0,0.0,1.0


In [26]:
# Share of all horizon 4-8 runs in which each variable (and the constant) has a non-missing coefficient
(
    res_4_9[list(coeffs_cols) + ['const']]
    .count()
    .div(len(res_4_9))
    .round(2)
    .to_frame('Share')
)

Unnamed: 0,Share
Hurst,0.05
CorrDim,0.06
Lyapunov,0.79
Skewness,1.0
Kurtosis,1.0
PSD,1.0
ACF_1,0.85
WL_C1,0.63
WL_C2,0.96
WL_C3,1.0


### Visualization for the KS-test

In [27]:
# Visual check of a single model
data_logdyn = pd.read_parquet('Data/final_dataset.parquet')
target = 'Flag'
horizon = 8
share = 0.1
state = 2000

# Observations closer than `horizon` steps to the transition are positives (1), the rest are negatives (0)
data_testing = data_logdyn.copy()
data_testing[target] = np.where(data_testing['Distance'] >= horizon, 0, 1)
# Drop every non-feature column; the previous list repeated 'MA100' and therefore
# left 'MV100' among the regressors
data_testing = data_testing.drop(columns = ['Volume', 'MA100', 'MV100', 'Rise', 'Distance', 'Index', 'Ticker'])

# Keep all positives and a random subset of negatives so that positives make up roughly `share` of the sample
data_testing_1 = data_testing[data_testing[target] == 1]
data_testing_0 = data_testing[data_testing[target] == 0]
Y_1 = data_testing_1[target]
X_1 = data_testing_1.drop(columns = [target])
share_1_orig = len(data_testing_1) / (len(data_testing_0) + len(data_testing_1))
# NOTE(review): share_1_orig is the positives' share of the whole dataset rather than the
# positives-to-negatives ratio, so the realised share ends up slightly above `share` - confirm this is intended
negative_fraction = share_1_orig * (1 - share) / share
if negative_fraction < 1:
    _, X_0, _, Y_0 = modsel.train_test_split(data_testing_0.drop(columns = [target]), data_testing_0[target],
                                             test_size = negative_fraction, random_state = state)
else:
    # All negatives are needed. Capping with min(..., 1) would pass the integer 1 to train_test_split,
    # which treats an integer test_size as an absolute number of samples (a single row)
    X_0 = data_testing_0.drop(columns = [target])
    Y_0 = data_testing_0[target]
share_1 = len(Y_1) / (len(Y_0) + len(Y_1))

# Fit the model on a train / test split of the rebalanced sample
Y = pd.concat([Y_0, Y_1])
X = sm.add_constant(pd.concat([X_0, X_1]))
X_train, X_test, Y_train, Y_test = modsel.train_test_split(X, Y, test_size = 0.2, random_state = state)
results_rs, auc_train_rs, auc_test_rs, ks_train_rs, ks_test_rs, f1_train_rs,\
    f1_test_rs, pr_train_rs, pr_test_rs, rec_train_rs, rec_test_rs\
    = fun.model_optimization(Y_train, Y_test, X_train, X_test, silent = True)
print(results_rs.summary())

# Compare the distributions of the predicted values for the test positives and negatives
Y_test_pred = results_rs.predict(X_test)
ks_samples = pd.DataFrame({'Y': Y_test, 'Y_pred': Y_test_pred})
ks_samples_posi = ks_samples.loc[ks_samples['Y'] == 1, 'Y_pred']
ks_samples_nega = ks_samples.loc[ks_samples['Y'] == 0, 'Y_pred']
fig = go.Figure()
fig.add_trace(go.Histogram(x = ks_samples_posi, name = 'Posi', nbinsx = 30))
fig.add_trace(go.Histogram(x = ks_samples_nega, name = 'Nega', nbinsx = 100))
fig.update_layout(barmode = 'overlay',
                  title = 'Predicted values on the test set by actual class',
                  xaxis_title = 'Predicted value', yaxis_title = 'Count')
fig.update_traces(opacity = 0.75)
fig.show()

                          Probit Regression Results                           
Dep. Variable:                   Flag   No. Observations:                30024
Model:                         Probit   Df Residuals:                    30004
Method:                           MLE   Df Model:                           19
Date:                Wed, 07 Aug 2024   Pseudo R-squ.:                  0.1584
Time:                        14:16:27   Log-Likelihood:                -8320.4
converged:                       True   LL-Null:                       -9886.2
Covariance Type:            nonrobust   LLR p-value:                     0.000
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                  -1.7121      0.074    -23.170      0.000      -1.857      -1.567
Lyapunov                3.0276      0.860      3.520      0.000       1.342       4.714
Skewness        

### Modelling with separate variables

In [3]:
# Read dataset
data_logdyn = pd.read_parquet('Data/final_dataset.parquet')

# Choose binary target and other parameters
target = 'Flag'
horizons = list(range(1, 6))
shares = np.linspace(0.05, 0.2, 4)
states = list(range(0, 10000, 500))

# Iterate over the chosen parameters and optimize classification models, then save all the results to the dataframe
res_sep = fun.model(data_logdyn, target, horizons, shares, states, separate = True)

# Extract the constant and the coefficient of the row's own variable (with their p-values)
# from the per-model mappings stored in 'Coeffs' and 'Pvalues'
res_sep['Const'] = res_sep['Coeffs'].apply(lambda x: x['const'])
res_sep['Const_Pvalue'] = res_sep['Pvalues'].apply(lambda x: x['const'])
coef = [coeffs[variable] for coeffs, variable in zip(res_sep['Coeffs'], res_sep['Variable'])]
coef_p = [pvalues[variable] for pvalues, variable in zip(res_sep['Pvalues'], res_sep['Variable'])]
res_sep['Coef'] = coef
res_sep['Coef_Pvalue'] = coef_p
res_sep.drop(columns = ['Coeffs', 'Pvalues']).to_parquet('Data/params_sep.parquet')

# Create pivot based on the variable and horizon parameters
# 'Pvalues' holds mappings rather than numbers, so it is excluded from the averaged columns
# together with 'Coeffs' (averaging an object column raises in pandas >= 2.0)
groups = ['Variable', 'Horizon']
drops = ['State', 'Coeffs', 'Pvalues', '1 Share', '1 Share real']
res_sep_means = res_sep.groupby(groups)[res_sep.columns.drop(groups + drops)].mean()
res_sep_means.to_parquet('Data/params_sep_mean.parquet')
res_sep_means[res_sep_means['Test AUC'] >= 0.75].sort_values('Test AUC', ascending = False)

100%|██████████| 5/5 [16:58<00:00, 203.76s/it]


Unnamed: 0_level_0,Unnamed: 1_level_0,Train size,Test size,Train AUC,Test AUC,Train KS-test p-value,Test KS-test p-value,Train F1-score,Test F1-score,Train precision,Test precision,Train recall,Test recall,Const,Const_Pvalue,Coef,Coef_Pvalue
Variable,Horizon,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Skewness_8_Variance,1,4648.565789,1162.447368,0.940487,0.941184,0.0,0.0,0.703316,0.711487,0.612789,0.625868,0.850697,0.862592,-1.51277,3.296619e-217,0.652842,5.436307e-46
Kurtosis_8_Variance,1,4709.863014,1177.767123,0.936753,0.937918,0.0,0.0,0.666247,0.679959,0.561397,0.583945,0.850932,0.859603,-1.466881,2.058904e-204,0.001631,2.3831820000000003e-43
ACF_1_8_Variance,1,4552.75,1138.5,0.930875,0.931375,0.0,0.0,0.7542,0.738487,0.706313,0.673013,0.8255,0.845675,-1.660177,1.426042e-232,793.14515,4.93482e-73
Skewness_8_Variance,2,9501.5,2375.771429,0.895057,0.896329,0.0,0.0,0.635514,0.614543,0.568829,0.528614,0.7491,0.773514,-1.462848,0.0,0.813546,1.3489950000000002e-84
Kurtosis_8_Variance,2,9918.949153,2480.152542,0.890254,0.890881,0.0,0.0,0.577915,0.598424,0.486559,0.514203,0.759136,0.758661,-1.43851,0.0,0.001947,5.874377e-74
ACF_1_8_Variance,2,9076.5,2269.5,0.882225,0.882237,0.0,0.0,0.649175,0.639925,0.6003,0.584213,0.73,0.747825,-1.558918,0.0,860.292515,4.765923e-118
Skewness_8_Variance,3,14732.015385,3683.446154,0.860585,0.857877,0.0,0.0,0.526231,0.545954,0.439615,0.474785,0.704538,0.697769,-1.429076,0.0,0.765947,1.926762e-104
Kurtosis_8_Variance,3,16676.297872,4169.553191,0.853809,0.850979,0.0,0.0,0.479596,0.488,0.382043,0.393234,0.70183,0.703,-1.460052,0.0,0.001748,3.6093349999999997e-90
ACF_1_8_Variance,3,13572.25,3393.5,0.842025,0.839112,0.0,0.0,0.5547,0.565813,0.494675,0.504663,0.676063,0.683462,-1.477161,0.0,802.732695,1.2669080000000002e-153
Skewness_8_Variance,4,19435.953125,4859.3125,0.820844,0.819078,0.0,0.0,0.469031,0.484,0.393828,0.418391,0.630016,0.624328,-1.367296,0.0,0.64628,1.643859e-108


In [12]:
# Best mean test AUC per variable: average over the runs of every group in `groups`,
# then keep the maximum of these averages for each variable
res_sep = pd.read_parquet('Data/params_sep.parquet')
(
    res_sep.groupby(groups)['Test AUC'].mean()
    .groupby(level = 'Variable').max()
    .sort_values(ascending = False)
    .round(2)
)

Variable
Skewness_8_Variance    0.94
Kurtosis_8_Variance    0.94
ACF_1_8_Variance       0.93
Skewness               0.81
Skewness_8_dyn         0.81
Kurtosis               0.80
Kurtosis_8_dyn         0.77
Lyapunov_8_Variance    0.76
Lyapunov               0.74
CorrDim                0.64
CorrDim_8_Variance     0.58
PSD_8_dyn              0.57
WL_C1_8_dyn            0.57
ACF_1                  0.55
WL_C1                  0.54
PSD_8_Variance         0.54
WL_C2                  0.53
WL_C2_8_Variance       0.53
PSD                    0.52
ACF_1_8_dyn            0.52
Hurst_8_Variance       0.51
WL_C1_8_Variance       0.51
WL_C3                  0.51
Hurst_8_dyn            0.51
WL_C3_8_dyn            0.51
WL_C2_8_dyn            0.51
Hurst                  0.50
WL_C3_8_Variance       0.50
CorrDim_8_dyn          0.50
Lyapunov_8_dyn         0.49
Name: Test AUC, dtype: float64