### Libraries import

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import functions as fun
import plotly.io as pio
import importlib as imp
import statsmodels.api as sm
import plotly.graph_objects as go
import sklearn.model_selection as modsel

import warnings
# Silence every warning for the whole notebook.
# NOTE(review): this also hides pandas warnings (e.g. chained-assignment warnings);
# consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")
# Use the dark plotly template as the default for all figures
pio.templates.default = "plotly_dark"
# Do not truncate columns when displaying wide DataFrames
pd.set_option('display.max_columns', None)

In [18]:
# Reload the local `functions` module (imported as `fun`) so that edits made to it
# are picked up without restarting the kernel
imp.reload(fun)



### Feature generation

In [2]:
# Read dataset and define columns for feature generation
data = pd.read_parquet('Data/dataset.parquet')
grouped = data.groupby(['Ticker', 'Index'])
indices = grouped.size().index.values
cols = ['Hurst', 'CorrDim', 'Lyapunov', 'Variance',
        'Skewness', 'Kurtosis', 'PSD', 'ACF_1',
        'WL_C1', 'WL_C2', 'WL_C3']

# Set lag for dynamics and short variance calculation
lag_model = [8]

# Calculate dynamics and short variance
# The original idea about variance was born from the behaviour of the largest Lyapunov exponent
# before the critical transition point: it mostly didn't move in nominal values,
# but its variance in some cases decreased significantly
frames = []
for ind, data_group in tqdm(grouped, total = grouped.ngroups):
    # Work on an explicit copy so the new columns are never written onto a view of `data`
    data_ind = data_group.copy()
    for col in cols:
        for lag_m in lag_model:
            # Relative change of the indicator over `lag_m` rows
            # NOTE(review): a zero value `lag_m` rows back yields +/-inf, which dropna() below does not remove
            data_ind[col + '_' + str(lag_m) + '_dyn'] = data_ind[col] / data_ind[col].shift(lag_m) - 1
            # Rolling variance of the indicator over a window of `lag_m` rows
            data_ind[col + '_' + str(lag_m) + '_Variance'] = data_ind[col].rolling(lag_m).var()
    # The first rows of every (Ticker, Index) group have no lagged values, so they are dropped
    frames.append(data_ind.dropna())

# Concatenate once instead of growing the frame inside the loop (which is quadratic)
data_logdyn = pd.concat(frames)

# Reset index to get rid of dates and save final dataset
data_logdyn.reset_index(drop = True, inplace = True)
data_logdyn = data_logdyn[data_logdyn['Distance'] > 0]
data_logdyn.to_parquet('Data/final_dataset.parquet')
data_logdyn

100%|██████████| 967/967 [00:40<00:00, 23.93it/s]


Unnamed: 0,Volume,MA100,Rise,Distance,Index,Ticker,Hurst,CorrDim,Lyapunov,Variance,Skewness,Kurtosis,PSD,ACF_1,WL_C1,WL_C2,WL_C3,Hurst_8_dyn,Hurst_8_Variance,CorrDim_8_dyn,CorrDim_8_Variance,Lyapunov_8_dyn,Lyapunov_8_Variance,Variance_8_dyn,Variance_8_Variance,Skewness_8_dyn,Skewness_8_Variance,Kurtosis_8_dyn,Kurtosis_8_Variance,PSD_8_dyn,PSD_8_Variance,ACF_1_8_dyn,ACF_1_8_Variance,WL_C1_8_dyn,WL_C1_8_Variance,WL_C2_8_dyn,WL_C2_8_Variance,WL_C3_8_dyn,WL_C3_8_Variance
0,382075.0,186099.90,False,291,2175,A,0.651958,1.502430e-15,0.005159,9.550465e+09,2.464177,11.200965,-0.771071,0.507490,0.342651,0.022711,0.002669,-0.119301,0.006105,2.031520,9.807346e-31,-4.189741,1.477781e-05,0.072674,6.780709e+14,0.033793,0.000063,-0.005914,0.006870,0.005967,0.000014,-0.030671,0.000010,0.077689,0.000880,-8.733268,0.000429,-1.455061,0.000175
1,292647.0,187908.86,False,290,2175,A,0.639823,-1.060215e-16,0.006704,9.585082e+09,2.445272,11.067568,-0.770524,0.511807,0.286024,-0.026283,-0.003199,0.362476,0.002009,-0.462728,9.704331e-31,-2.510252,1.814968e-05,0.009710,1.774681e+15,-0.013670,0.000217,-0.028593,0.020058,0.005660,0.000015,-0.006026,0.000011,-0.043911,0.000973,1.643379,0.000485,-0.750029,0.000142
2,77650.0,186442.90,False,289,2175,A,0.692141,1.234920e-15,0.002755,9.586808e+09,2.444799,11.063926,-0.770989,0.510281,0.343892,0.019235,0.016757,-0.083908,0.000592,-2.025169,7.012497e-31,-2.145498,1.679701e-05,0.010084,2.591068e+15,-0.013094,0.000337,-0.028612,0.029793,0.007798,0.000015,-0.012033,0.000012,0.030140,0.001035,-3.108573,0.000570,1.116842,0.000160
3,69826.0,185885.98,False,288,2175,A,0.691744,1.381300e-15,0.003034,9.594264e+09,2.448700,11.069584,-0.773628,0.511491,0.296838,-0.009135,0.023979,0.069373,0.000910,0.064045,7.197949e-31,-1.740880,1.192779e-05,0.010890,3.212532e+15,-0.011603,0.000395,-0.028205,0.035370,0.015778,0.000009,-0.009283,0.000012,0.098330,0.000763,-0.748113,0.000410,0.362201,0.000187
4,68277.0,185620.89,False,287,2175,A,0.677108,-3.765637e-15,0.001907,9.605297e+09,2.446302,11.047708,-0.776542,0.512073,0.335905,0.016324,0.000646,0.105165,0.000793,4.109762,2.992368e-30,-1.635121,7.494889e-06,0.012696,3.541439e+15,-0.015236,0.000404,-0.033008,0.036562,0.017787,0.000005,-0.009602,0.000010,-0.021040,0.000724,-6.134960,0.000442,-1.047026,0.000129
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
282358,305417.0,163643.22,False,5,2923,ZWS,0.640033,-1.075664e-15,0.001451,9.862010e+09,2.750077,10.450696,-0.249480,0.509450,0.426579,-0.014910,-0.024646,0.014039,0.008911,0.136079,3.220794e-30,-1.665425,5.713148e-07,0.002595,2.802480e+16,0.003499,0.000832,0.023218,0.043614,-0.105693,0.000085,0.054352,0.000184,-0.065463,0.000673,-0.651608,0.002305,-0.487456,0.000051
282359,208657.0,164804.55,False,4,2923,ZWS,0.650464,1.428183e-15,0.001590,9.805879e+09,2.767580,10.603502,-0.246640,0.514747,0.377997,-0.090383,-0.032964,0.566612,0.007408,-0.378489,2.574505e-30,20.068661,8.768582e-07,-0.007021,2.640481e+16,0.017627,0.000655,0.052053,0.017863,-0.105187,0.000075,0.044414,0.000244,0.013370,0.000620,-0.092599,0.002211,-0.182851,0.000050
282360,1369475.0,177527.24,False,3,2923,ZWS,0.605225,2.859733e-16,0.006145,1.290870e+10,4.419049,33.761933,-0.245769,0.407302,0.452935,-0.008462,-0.046249,-0.031139,0.007151,-11.689347,2.655114e-30,-10.379825,4.955767e-06,0.343350,1.250351e+18,0.620435,0.342839,2.257314,67.236473,-0.092668,0.000062,-0.171816,0.001229,0.033572,0.000754,-0.400974,0.002288,0.036566,0.000054
282361,680336.0,183571.70,False,2,2923,ZWS,0.623110,-1.956124e-15,0.006337,1.351842e+10,4.314847,31.412914,-0.243810,0.490572,0.421536,-0.108931,-0.027726,0.367783,0.005303,-0.049995,2.618025e-30,15.533128,8.217537e-06,0.416228,2.518010e+18,0.574922,0.548896,1.988899,104.172504,-0.089003,0.000046,0.009844,0.001233,0.040718,0.000743,-0.075623,0.002146,-0.270012,0.000061


### Modelling with all variables

In the cell below we iterate over three lists of parameters:
- horizons - how many hours before the transition are considered close enough to belong to the prediction phase
- shares - the target share of positive observations in the modelling dataset. This parameter is important because in the original dataset the share of positives for some of the horizons was too small, so we decided to reduce the size of the negative sample by drawing a random subset of it
- states - to avoid lucky random draws when subsampling the negatives, we use a list of different random states and average the results over them

In [21]:
# Read dataset
data_logdyn = pd.read_parquet('Data/final_dataset.parquet')

# Choose binary target and other parameters
target = 'Flag'
# Horizons 1..8
horizons = list(range(1, 9))
# Target shares of positives: 0.05, 0.10, 0.15, 0.20
shares = np.linspace(0.05, 0.2, 4)
# 20 random states: 0, 500, ..., 9500
states = list(range(0, 10000, 500))

# Iterate over the chosen parameters and optimize classification models, then save all the results to the dataframe
# NOTE(review): `fun.model` is defined in the local `functions` module; the code below relies on it
# returning a DataFrame with the columns 'Coeffs', 'State', 'Horizon', '1 Share' and '1 Share real'
res = fun.model(data_logdyn, target, horizons, shares, states)

# OHE-like transformation of the variables' lists:
# every entry of 'Coeffs' becomes one row of `res_coeffs` with one column per candidate variable (plus 'const')
# NOTE(review): assumes each entry is dict-like / a Series keyed by variable name, so that variables
# absent from a fitted model end up as NaN - confirm against `fun.model`
coeffs_cols = data_logdyn.columns.drop(['Volume', 'MA100', 'Rise', 'Distance', 'Index', 'Ticker'])
res_coeffs = pd.DataFrame(columns = list(coeffs_cols) + ['const'])
for row in res['Coeffs']:
    res_coeffs.loc[len(res_coeffs)] = row
# The join is positional by index label: `res_coeffs` is indexed 0..n-1,
# so `res` is presumably expected to carry the same default index - TODO confirm
res = res.drop(columns = ['Coeffs']).join(res_coeffs)
res.to_parquet('Data/params.parquet')

# Create pivot based on the horizon and 1 share parameters
# (mean over the random states of every remaining column, i.e. metrics and coefficients)
groups = ['Horizon', '1 Share', '1 Share real']
drops = ['State']
res_means = res.groupby(groups)[res.columns.drop(groups + drops)].mean()
res_means.to_parquet('Data/params_mean.parquet')
res_means

100%|██████████| 8/8 [48:20<00:00, 362.52s/it]


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Train size,Test size,Train AUC,Test AUC,Train KS-test p-value,Test KS-test p-value,Train F1-score,Test F1-score,Train precision,Test precision,Train recall,Test recall,Hurst,CorrDim,Lyapunov,Variance,Skewness,Kurtosis,PSD,ACF_1,WL_C1,WL_C2,WL_C3,Hurst_8_dyn,Hurst_8_Variance,CorrDim_8_dyn,CorrDim_8_Variance,Lyapunov_8_dyn,Lyapunov_8_Variance,Variance_8_dyn,Variance_8_Variance,Skewness_8_dyn,Skewness_8_Variance,Kurtosis_8_dyn,Kurtosis_8_Variance,PSD_8_dyn,PSD_8_Variance,ACF_1_8_dyn,ACF_1_8_Variance,WL_C1_8_dyn,WL_C1_8_Variance,WL_C2_8_dyn,WL_C2_8_Variance,WL_C3_8_dyn,WL_C3_8_Variance,const
Horizon,1 Share,1 Share real,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1
1,0.05,0.050163,15421.0,3856.0,0.92325,0.9182,0.0,0.0,0.58595,0.5955,0.4553,0.4724,0.8268,0.81945,-2.836461,-15.740239,-3.663509,-5.97797e-14,-0.008667,-0.001192,0.546712,-1.026306,-0.914697,1.372044,-1.226807,0.397427,-38.748897,-2.563898e-14,-41.048173,-0.000346,,10.21382,-7.132355e-25,-2.538539,0.86986,0.23851,-0.002887,,,1.197481,180.503297,,-46.251086,,,,41.30908,-2.548671
1,0.1,0.100301,7712.0,1929.0,0.9203,0.91455,0.0,0.0,0.7045,0.72695,0.62125,0.6615,0.82,0.8153,-2.267233,,-4.807284,-6.136542e-14,-0.095644,0.005965,0.571657,-1.17025,-1.211351,1.527135,-1.416064,,-47.029956,-2.291864e-15,-74.854543,-0.002468,-971.009916,11.954201,-1.0828119999999999e-24,-4.121003,0.965052,0.469343,-0.003097,,-4867.168365,-0.69172,277.288506,,-127.932527,,-161.455716,,36.383996,-2.025609
1,0.15,0.150436,5142.0,1286.0,0.92165,0.9129,0.0,0.0,0.77315,0.77655,0.7297,0.7451,0.8238,0.8146,-1.956015,-9.201169,-5.869538,-7.997322e-14,-0.134507,0.007855,0.674772,-1.123908,-1.208141,1.582809,-1.338915,0.662518,-207.861858,-1.693265e-14,-140.293027,0.001873,-645.172293,12.548694,-9.032075000000001e-25,-4.58538,1.251828,0.511425,-0.004205,-2.030994,-4236.787652,-0.517108,313.837209,,-184.217892,,-162.375541,,48.875164,-1.642316
1,0.2,0.200539,3857.0,965.0,0.91245,0.90955,0.0,0.0,0.80395,0.8009,0.80285,0.7932,0.8062,0.81245,-2.3633,,-5.300242,-2.83648e-13,-0.175833,0.010362,0.683962,-0.094081,-1.63819,4.38697,8.2061,0.29899,-189.46098,-1.804449e-14,-68.078046,0.000506,-560.922123,12.378002,-1.010974e-24,-4.61267,0.656525,0.576783,-0.002939,-1.173069,-5504.506114,-0.79343,344.246442,,-273.813473,,-207.540727,,32.497903,-1.358287
2,0.05,0.050328,30742.0,7686.0,0.87585,0.87535,0.0,0.0,0.49835,0.48405,0.38,0.36325,0.7345,0.73975,-0.958572,-1.068177,-3.338986,-4.836858e-14,0.041115,-0.002833,0.4935,-0.91592,-1.101189,1.912688,-0.538439,0.054147,-36.906493,-3.386951e-16,-57.726519,,-976.070953,10.360665,1.308142e-25,-3.337837,0.856447,0.318641,-0.003489,,-965.639971,-0.310206,165.538675,0.54204,-69.130642,,-117.740482,,,-2.308333
2,0.1,0.100619,15376.0,3845.0,0.8824,0.8814,0.0,0.0,0.6311,0.6468,0.54585,0.57065,0.75235,0.75305,-2.359455,-2.190234,-4.049061,,0.162124,-0.008641,0.393995,-0.655444,-0.901613,3.111386,0.410131,0.199839,-69.780617,-9.404227e-15,239.807269,,-745.278788,10.956299,-5.966207e-25,-4.257321,1.211338,0.485366,-0.00386,,-385.904679,0.407334,238.377527,-0.572965,-79.19088,,-102.606772,-0.002324,2.607456,-2.022419
2,0.15,0.15087,10255.0,2564.0,0.8833,0.8848,0.0,0.0,0.69755,0.7015,0.65005,0.65905,0.75495,0.75975,,-7.829393,-4.93494,,0.086653,-0.004389,0.549032,-0.093965,-1.219051,2.73542,0.041297,-0.44572,-26.513721,-1.47231e-14,-29.423162,,-413.079547,11.800358,-8.794344e-25,-5.088401,1.467042,0.561569,-0.005145,-0.575988,-75.480833,-0.326414,290.615919,,-80.992808,,-88.390201,-0.004598,27.381141,-1.761593
2,0.2,0.201102,7693.0,1924.0,0.8801,0.8804,0.0,0.0,0.7401,0.7399,0.73305,0.7272,0.74995,0.7575,-1.772086,,-4.433529,,0.091106,-0.004156,0.531595,-0.020288,-0.768234,0.87544,0.146238,,-23.940715,-7.600844e-15,-35.047779,,-399.71736,12.232449,,-5.998537,1.481429,0.755312,-0.004872,-0.762209,-1249.22338,-0.465761,343.230958,-0.707674,-140.938096,,-114.798795,-0.002767,30.905441,-1.579822
3,0.05,0.050494,45961.0,11491.0,0.84995,0.8471,0.0,0.0,0.41305,0.4331,0.2931,0.31705,0.70165,0.69385,,,-2.322858,,0.193622,-0.010947,0.260088,,0.336089,,0.653786,-0.232547,,-2.941765e-15,,,,9.391092,,-3.407655,0.632179,0.405303,-0.002843,,432.556962,,171.565764,-0.465568,-27.017666,,-19.543474,,,-2.177392
3,0.1,0.100936,22992.0,5749.0,0.84,0.83995,0.0,0.0,0.55405,0.5571,0.4702,0.47105,0.6787,0.6892,-1.739078,-4.19665,-2.543668,,-0.005324,0.000248,0.423505,-0.511136,-0.821454,0.833875,-0.571585,-0.174397,-23.902223,-5.271081e-15,151.569987,,-735.797076,10.611422,,-4.549995,1.115655,0.536842,-0.004074,-0.29084,145.601797,-0.611545,289.100108,-0.490151,-52.184312,,-87.865205,-0.002871,32.194458,-1.813675


In [22]:
# Get mean metrics for all of the columns to understand what variables are actually used in the final models
# Only horizons 4..8 are averaged; Index.isin replaces the deprecated np.in1d
horizon_mask = res_means.index.get_level_values(0).isin(range(4, 9))
res_means[horizon_mask].mean().round(2).to_frame('Average coeff')

Unnamed: 0,Average coeff
Train size,47398.8
Test size,11850.15
Train AUC,0.75
Test AUC,0.75
Train KS-test p-value,0.0
Test KS-test p-value,0.0
Train F1-score,0.42
Test F1-score,0.42
Train precision,0.35
Test precision,0.36


In [23]:
# Count a share of appearances: for every (horizon, share) group, the fraction of fitted models
# in which each variable has a (non-missing) coefficient
res = pd.read_parquet('Data/params.parquet')
res_4_9 = res[res['Horizon'].isin(range(4, 9))]
groups = ['Horizon', '1 Share', '1 Share real']
drops = ['State']
grouped_4_9 = res_4_9.groupby(groups)
# Divide by the actual number of runs in each group instead of a hard-coded 20 (= number of random states),
# so the shares stay correct if the list of states changes or some runs are missing
round(grouped_4_9[list(coeffs_cols) + ['const']].count().div(grouped_4_9.size(), axis = 0), 2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Hurst,CorrDim,Lyapunov,Variance,Skewness,Kurtosis,PSD,ACF_1,WL_C1,WL_C2,WL_C3,Hurst_8_dyn,Hurst_8_Variance,CorrDim_8_dyn,CorrDim_8_Variance,Lyapunov_8_dyn,Lyapunov_8_Variance,Variance_8_dyn,Variance_8_Variance,Skewness_8_dyn,Skewness_8_Variance,Kurtosis_8_dyn,Kurtosis_8_Variance,PSD_8_dyn,PSD_8_Variance,ACF_1_8_dyn,ACF_1_8_Variance,WL_C1_8_dyn,WL_C1_8_Variance,WL_C2_8_dyn,WL_C2_8_Variance,WL_C3_8_dyn,WL_C3_8_Variance,const
Horizon,1 Share,1 Share real,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
4,0.05,0.050661,0.05,0.0,0.65,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.75,0.25,0.0,0.1,0.15,0.0,0.0,1.0,0.0,1.0,0.95,1.0,1.0,0.0,1.0,0.0,0.95,0.95,1.0,0.0,0.7,0.0,0.0,1.0
4,0.1,0.101251,0.05,0.05,0.55,0.0,0.95,0.95,0.95,0.05,0.1,0.05,0.35,0.25,0.05,0.05,0.15,0.0,0.05,1.0,0.15,1.0,1.0,1.0,1.0,0.15,1.0,0.3,1.0,1.0,1.0,0.0,0.5,0.0,0.05,0.85
4,0.15,0.15177,0.05,0.05,0.5,0.0,0.9,0.9,0.9,0.1,0.15,0.15,0.5,0.15,0.1,0.05,0.15,0.05,0.1,1.0,0.05,1.0,0.9,1.0,0.9,0.1,1.0,0.35,0.95,1.0,1.0,0.0,0.4,0.05,0.1,0.75
4,0.2,0.202217,0.0,0.05,0.55,0.0,1.0,0.95,1.0,0.2,0.2,0.1,0.35,0.15,0.2,0.1,0.15,0.05,0.0,1.0,0.05,1.0,0.9,1.0,0.95,0.15,0.95,0.45,1.0,1.0,0.9,0.0,0.35,0.0,0.05,0.8
5,0.05,0.050829,0.0,0.0,0.8,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.6,0.2,0.0,0.0,0.2,0.0,0.0,1.0,0.05,1.0,1.0,1.0,1.0,0.05,1.0,0.05,1.0,1.0,1.0,0.0,0.6,0.0,0.0,1.0
5,0.1,0.101569,0.0,0.0,0.45,0.0,0.95,0.95,0.95,0.0,0.0,0.0,0.5,0.25,0.0,0.0,0.1,0.0,0.05,1.0,0.0,1.0,1.0,1.0,1.0,0.15,1.0,0.0,0.95,1.0,1.0,0.0,0.6,0.0,0.05,0.95
5,0.15,0.152221,0.0,0.0,0.5,0.0,0.9,0.9,0.95,0.1,0.05,0.1,0.65,0.1,0.05,0.1,0.2,0.0,0.05,1.0,0.0,1.0,0.9,1.0,0.95,0.25,1.0,0.2,1.0,1.0,1.0,0.0,0.6,0.0,0.05,0.85
5,0.2,0.202785,0.0,0.0,0.4,0.0,0.95,0.9,0.95,0.15,0.15,0.1,0.55,0.05,0.15,0.05,0.0,0.05,0.1,1.0,0.1,1.0,0.85,1.0,0.95,0.25,1.0,0.5,1.0,1.0,0.95,0.0,0.55,0.0,0.05,0.75
6,0.05,0.050999,0.05,0.0,0.7,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.95,0.35,0.0,0.0,0.4,0.0,0.0,1.0,0.15,1.0,1.0,1.0,1.0,0.1,1.0,0.05,1.0,1.0,1.0,0.0,0.75,0.0,0.0,1.0
6,0.1,0.10189,0.15,0.0,0.6,0.0,0.9,0.9,0.9,0.05,0.0,0.05,0.9,0.25,0.05,0.1,0.35,0.1,0.15,1.0,0.0,1.0,1.0,1.0,1.0,0.15,1.0,0.05,1.0,1.0,1.0,0.0,0.75,0.0,0.05,0.9


In [24]:
# Aggregated share of appearances: fraction of all horizon 4-8 runs
# in which each variable has a (non-missing) coefficient
coeff_columns_4_9 = list(coeffs_cols) + ['const']
(
    res_4_9[coeff_columns_4_9]
    .count()
    .div(len(res_4_9))
    .round(2)
    .to_frame('Share')
)

Unnamed: 0,Share
Hurst,0.06
CorrDim,0.01
Lyapunov,0.71
Variance,0.0
Skewness,0.96
Kurtosis,0.96
PSD,0.97
ACF_1,0.06
WL_C1,0.09
WL_C2,0.04


### Visualization for the KS-test

In [7]:
# Visual check of a single model: fit one (horizon, share, state) configuration and compare
# the distributions of predicted scores for positive and negative test observations
data_logdyn = pd.read_parquet('Data/final_dataset.parquet')
target = 'Flag'
horizon = 8
share = 0.1
state = 2000

# Target is 1 when the observation is closer than `horizon` to the transition, 0 otherwise
flag = (~data_logdyn['Distance'].ge(horizon)).astype('int64')
data_testing = (
    data_logdyn
    .drop(columns = ['Volume', 'MA100', 'Rise', 'Distance', 'Index', 'Ticker'])
    .assign(**{target: flag})
)

# Keep all positives and subsample the negatives to approach the requested share of positives
data_testing_1 = data_testing[data_testing[target] == 1]
data_testing_0 = data_testing[data_testing[target] == 0]
Y_1 = data_testing_1[target]
X_1 = data_testing_1.drop(columns = [target])
share_1_orig = len(data_testing_1) / (len(data_testing_0) + len(data_testing_1))
# Fraction of the negatives to keep
# NOTE(review): this uses the share of positives in the whole dataset rather than the positives/negatives ratio,
# so the realised share ends up slightly above `share`; kept as is to stay comparable with the saved results
negative_fraction = share_1_orig * (1 - share) / share
if negative_fraction < 1:
    _, X_0, _, Y_0 = modsel.train_test_split(data_testing_0.drop(columns = [target]), data_testing_0[target],
                                             test_size = negative_fraction, random_state = state)
else:
    # Not enough negatives to reach the requested share: keep all of them.
    # (Passing test_size = 1 would be read by train_test_split as "exactly one sample".)
    X_0 = data_testing_0.drop(columns = [target])
    Y_0 = data_testing_0[target]
share_1 = len(Y_1) / (len(Y_0) + len(Y_1))

# Assemble the modelling dataset and split it into train and test parts
Y = pd.concat([Y_0, Y_1])
X = sm.add_constant(pd.concat([X_0, X_1]))
X_train, X_test, Y_train, Y_test = modsel.train_test_split(X, Y, test_size = 0.2, random_state = state)

# Fit the model; only the fitted results object is used below, the metrics are unpacked for inspection
results_rs, auc_train_rs, auc_test_rs, ks_train_rs, ks_test_rs, f1_train_rs,\
    f1_test_rs, pr_train_rs, pr_test_rs, rec_train_rs, rec_test_rs\
    = fun.model_optimization(Y_train, Y_test, X_train, X_test, silent = True)
print(results_rs.summary())

# Predicted scores on the test set, split by the true class
Y_test_pred = results_rs.predict(X_test)
ks_samples = pd.DataFrame({'Y': Y_test, 'Y_pred': Y_test_pred})
ks_samples_posi = ks_samples[ks_samples['Y'] == 1]['Y_pred']
ks_samples_nega = ks_samples[ks_samples['Y'] == 0]['Y_pred']

# Overlaid histograms of the two score distributions
fig = go.Figure()
fig.add_trace(go.Histogram(x = ks_samples_posi, name = 'Posi'))
fig.add_trace(go.Histogram(x = ks_samples_nega, name = 'Nega'))
fig.update_layout(barmode = 'overlay',
                  title = 'Predicted scores on the test set by true class',
                  xaxis_title = 'Predicted score',
                  yaxis_title = 'Number of observations')
fig.update_traces(opacity = 0.75)
fig.show()

                          Probit Regression Results                           
Dep. Variable:                   Flag   No. Observations:                52980
Model:                         Probit   Df Residuals:                    52962
Method:                           MLE   Df Model:                           17
Date:                Sun, 10 Mar 2024   Pseudo R-squ.:                  0.1445
Time:                        11:11:42   Log-Likelihood:                -14897.
converged:                       True   LL-Null:                       -17413.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                  -1.4468      0.057    -25.273      0.000      -1.559      -1.335
Lyapunov               -2.5608      0.684     -3.746      0.000      -3.901      -1.221
Skewness        

### Modelling with separate variables

In [25]:
# Read dataset
data_logdyn = pd.read_parquet('Data/final_dataset.parquet')

# Choose binary target and other parameters
target = 'Flag'
horizons = list(range(1, 6))
shares = np.linspace(0.05, 0.2, 4)
states = list(range(0, 10000, 500))

# Iterate over the chosen parameters and optimize classification models, then save all the results to the dataframe
# With separate = True every result row refers to a single variable (column 'Variable')
res_sep = fun.model(data_logdyn, target, horizons, shares, states, separate = True)

# Extract the constant and the coefficient of the row's own variable, together with their p-values
res_sep['Const'] = res_sep['Coeffs'].apply(lambda x: x['const'])
res_sep['Const_Pvalue'] = res_sep['Pvalues'].apply(lambda x: x['const'])
res_sep['Coef'] = [coeffs[var] for coeffs, var in zip(res_sep['Coeffs'], res_sep['Variable'])]
res_sep['Coef_Pvalue'] = [pvalues[var] for pvalues, var in zip(res_sep['Pvalues'], res_sep['Variable'])]

# Save the per-run results under their own name: writing them to 'Data/params_mean.parquet'
# would overwrite the averaged results of the all-variables models
res_sep.drop(columns = ['Coeffs', 'Pvalues']).to_parquet('Data/params_sep.parquet')

# Create pivot based on the variable and horizon parameters
groups = ['Variable', 'Horizon']
# 'Coeffs' and 'Pvalues' hold per-model objects and cannot be averaged, so both are excluded
drops = ['State', 'Coeffs', 'Pvalues']
res_sep_means = res_sep.groupby(groups)[res_sep.columns.drop(groups + drops)].mean()
res_sep_means.to_parquet('Data/params_sep_mean.parquet')
res_sep_means.sort_values('Test AUC', ascending = False).head(25)

100%|██████████| 5/5 [26:38<00:00, 319.64s/it]


Unnamed: 0_level_0,Unnamed: 1_level_0,1 Share,1 Share real,Train size,Test size,Train AUC,Test AUC,Train KS-test p-value,Test KS-test p-value,Train F1-score,Test F1-score,Train precision,Test precision,Train recall,Test recall,Const,Const_Pvalue,Coef,Coef_Pvalue
Variable,Horizon,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Variance_8_dyn,1,0.125949,0.126312,7939.481013,1985.620253,0.932203,0.931709,0.0,0.0,0.739532,0.735051,0.672405,0.66143,0.839747,0.848329,-1.785931,0.0,8.291762,9.685841999999999e-166
Skewness_8_Variance,1,0.122667,0.12302,8225.733333,2057.186667,0.917947,0.918227,0.0,0.0,0.611813,0.620987,0.50204,0.514893,0.82432,0.82572,-1.394306,0.0,1.299072,3.550993e-48
Kurtosis_8_Variance,1,0.121324,0.121674,8354.220588,2089.308824,0.910441,0.911838,0.0,0.0,0.589824,0.588029,0.481529,0.478015,0.809074,0.821191,-1.357841,3.624606e-310,0.004013344,6.10903e-29
ACF_1_8_Variance,1,0.124675,0.125034,8095.571429,2024.636364,0.909429,0.909494,0.0,0.0,0.609026,0.631753,0.504195,0.53761,0.814091,0.811649,-1.551551,0.0,1413.674,2.419075e-126
Variance_8_dyn,2,0.125,0.12573,16016.5,4004.75,0.886262,0.88935,0.0,0.0,0.631938,0.646713,0.550125,0.571,0.776738,0.777675,-1.643001,0.0,8.330796,1.5142099999999999e-266
Skewness_8_Variance,2,0.119531,0.120231,17176.9375,4294.84375,0.877375,0.878891,0.0,0.0,0.521203,0.531734,0.421672,0.432984,0.742656,0.751203,-1.363297,0.0,1.082954,2.4139029999999997e-130
Kurtosis_8_Variance,2,0.119355,0.120054,17111.112903,4278.387097,0.869677,0.870226,0.0,0.0,0.491081,0.515097,0.37929,0.411403,0.759548,0.748097,-1.318958,0.0,0.002397945,5.084946e-59
ACF_1_8_Variance,2,0.125,0.125729,16151.302632,4038.434211,0.863066,0.863079,0.0,0.0,0.540829,0.550474,0.445487,0.460474,0.734408,0.734632,-1.468496,0.0,1370.217,8.530757e-215
Variance_8_dyn,3,0.125,0.126102,23949.25,5988.25,0.843625,0.843062,0.0,0.0,0.556263,0.567575,0.473425,0.4906,0.712538,0.711375,-1.527528,0.0,7.80043,0.0
Skewness_8_Variance,3,0.113281,0.114293,26401.625,6601.328125,0.836641,0.836359,0.0,0.0,0.447547,0.451234,0.345438,0.350781,0.690734,0.694516,-1.354372,0.0,1.034364,7.854925e-117
