In [1]:
import operator
import numpy as np
import scipy as sp
import pandas as pd
from tqdm import tqdm
import plotly.io as pio
import statsmodels.api as sm
import plotly.graph_objects as go
import sklearn.metrics as metrics
import sklearn.model_selection as modsel

import warnings
warnings.filterwarnings("ignore")
pio.templates.default = "plotly_dark"

In [2]:
def roc_metric(Y, Y_pred):
    """
    Function for the calculation of AUC metric

    Inputs:
    ----------
    Y : DataFrame
        Set of Y for the model
    Y_pred : DataFrame
        Set of predicted Y for the model

    Returns:
    ----------
    
    """

    fpr, tpr, thresholds = metrics.roc_curve(Y, Y_pred, pos_label=1)
    auc = round(metrics.auc(fpr, tpr), 3)
    optimal_index = np.argmax(tpr - fpr)

    return auc, thresholds[optimal_index]

#---------------------------------------------------------------------------------------

def remove_most_insignificant(X, X_test, results):
    """
    Function for the removal of the most insignificant variables from the model

    Inputs:
    ----------
    X : DataFrame
        Set of X for the model
    results : model
        Fitted statsmodels model

    Returns:
    ----------
    X_val : DataFrame
        Optimized set of X for the validation of the model
    X_test : DataFrame
        Optimized set of X for the testing of the model
    """
    
    # Use operator to find the key which belongs to the maximum value in the dictionary
    max_p_value = max(results.pvalues.iteritems(), key = operator.itemgetter(1))[0]
    # Drop the worst feature
    X.drop(columns = max_p_value, inplace = True)
    X_test.drop(columns = max_p_value, inplace = True)

    return X, X_test

#---------------------------------------------------------------------------------------

def model_optimization(Y_train,
                       Y_test,
                       X_train,
                       X_test,
                       type:str = 'Probit', 
                       p_value_bord:float = 0.05, 
                       silent:bool = False):
    """
    Function for the optimization of OLS

    Inputs:
    ----------
    Y : array
        Target variable for the regression
    X : DataFrame
        Set of X for the model
    type : str = 'Probit'
        Type of the model
    p_value_bord : float = 0.05
        Maximum acceptable p-value for the coefficient
    silent : bool = False
        Whether not to show reports about model

    Returns:
    ----------
    results : model
        Fitted statsmodels model
    val_rmse : float
        RMSE score for the validation
    test_rmse : float
        RMSE score for the test
    Y_val_pred : array
        Prediction for the validation
    Y_test_pred : array
        Prediction for the test
    """
    
    insignificant_feature = True
    while insignificant_feature:
        # Create model
        if type == 'Probit':
            model = sm.Probit(Y_train, X_train)
        else:
            model = sm.Logit(Y_train, X_train)

        # Fit model and get
        results = model.fit(disp = 0)
        significant = [p_value < p_value_bord for p_value in results.pvalues]
        if all(significant):
            insignificant_feature = False
        else:
            # If there's only one insignificant variable left
            if X_train.shape[1] == 1:
                print('No significant features found')
                results = None
                insignificant_feature = False
            else:
                X_train, X_test = remove_most_insignificant(X_train, X_test, results)
    
    Y_train_pred = results.predict(X_train)
    Y_test_pred = results.predict(X_test)
    auc_train, threshold_train = roc_metric(Y_train, Y_train_pred)
    auc_test, threshold_test = roc_metric(Y_test, Y_test_pred)
    Y_train_pred_round = np.where(Y_train_pred < threshold_train, np.floor(Y_train_pred), np.ceil(Y_train_pred))
    Y_test_pred_round = np.where(Y_test_pred < threshold_test, np.floor(Y_test_pred), np.ceil(Y_test_pred))

    ks_samples_train = pd.DataFrame({'Y': Y_train, 'Y_pred': Y_train_pred})
    ks_samples_train_posi = ks_samples_train[ks_samples_train['Y'] == 1]['Y_pred']
    ks_samples_train_nega = ks_samples_train[ks_samples_train['Y'] == 0]['Y_pred']
    ks_train = sp.stats.kstest(ks_samples_train_posi, ks_samples_train_nega)
    ks_samples_test = pd.DataFrame({'Y': Y_test, 'Y_pred': Y_test_pred})
    ks_samples_test_posi = ks_samples_test[ks_samples_test['Y'] == 1]['Y_pred']
    ks_samples_test_nega = ks_samples_test[ks_samples_test['Y'] == 0]['Y_pred']
    ks_test = sp.stats.kstest(ks_samples_test_posi, ks_samples_test_nega)

    f1_train = round(metrics.f1_score(Y_train, Y_train_pred_round), 3)
    f1_test = round(metrics.f1_score(Y_test, Y_test_pred_round), 3)
    pr_train = round(metrics.precision_score(Y_train, Y_train_pred_round), 3)
    pr_test = round(metrics.precision_score(Y_test, Y_test_pred_round), 3)
    rec_train = round(metrics.recall_score(Y_train, Y_train_pred_round), 3)
    rec_test = round(metrics.recall_score(Y_test, Y_test_pred_round), 3)
    if silent == False:
        print(f'''Train AUC score: {auc_train}, Train KS-test p-value: {round(ks_train.pvalue, 3)}, 
              Train F1-score: {f1_train}, Train precision: {pr_train}, Train recall: {rec_train}''')
        print(f'''Test AUC score: {auc_test}, Test KS-test p-value: {round(ks_test.pvalue, 3)}, 
              Test F1-score: {f1_test}, Test precision: {pr_test}, Test recall: {rec_test}''')
        print(results.summary())

    return results, auc_train, auc_test, round(ks_train.pvalue, 9), round(ks_test.pvalue, 9),\
           f1_train, f1_test, pr_train, pr_test, rec_train, rec_test

In [9]:
# Read dataset and define columns for feature generation
data = pd.read_parquet('Data/dataset.parquet')
indices = data.groupby(['Ticker', 'Index']).size().index.values
cols = ['Hurst', 'Correlation Dimension', 'Lyapunov', 'Variance', 'PSD', 'ACF_1']

# Set lag for dynamics and short variance calculation
lag_model = [8]

# Calculate dynamics and short variance
# Original idea about variance was born from the largest Lyapunov exponent's behaviour before the critical transition point:
# is mostly didn't move in nominal values but its variance in some cases decreased signigicantly 
data_logdyn = pd.DataFrame()
for ind in tqdm(indices):
    data_ind = data[(data['Ticker'] == ind[0]) & (data['Index'] == ind[1])]
    for col in cols:
        for lag_m in lag_model:
            data_ind[col + '_' + str(lag_m) + '_dyn'] = data_ind[col] / data_ind[col].shift(lag_m) - 1
            data_ind[col + '_' + str(lag_m) + '_Variance'] = data_ind[col].rolling(lag_m).var()
    data_ind.dropna(inplace = True)
    data_logdyn = pd.concat([data_logdyn, data_ind])

# Reset index to get rid of dates and save final dataset
data_logdyn.reset_index(drop = True, inplace = True)
data_logdyn = data_logdyn[data_logdyn['Distance'] > 0]
data_logdyn.to_parquet('Data/final_dataset.parquet')
data_logdyn

100%|██████████| 876/876 [00:57<00:00, 15.23it/s]


Unnamed: 0,Volume,MA100,Rise,Distance,Index,Ticker,Hurst,Correlation Dimension,Lyapunov,Variance,...,Correlation Dimension_8_dyn,Correlation Dimension_8_Variance,Lyapunov_8_dyn,Lyapunov_8_Variance,Variance_8_dyn,Variance_8_Variance,PSD_8_dyn,PSD_8_Variance,ACF_1_8_dyn,ACF_1_8_Variance
0,57509.0,126263.46,False,691,2175,A,0.403880,-4.682077e-17,0.005964,1.083606e+10,...,-0.552242,3.655000e-31,0.103982,0.000002,0.001801,9.397053e+14,0.014422,0.000007,0.008936,0.000008
1,123558.0,126543.77,False,690,2175,A,0.390738,-6.140047e-17,0.008423,1.084346e+10,...,-1.037909,8.006177e-33,0.323594,0.000002,0.001019,1.065405e+15,0.014569,0.000009,0.010146,0.000011
2,193237.0,127637.89,False,689,2175,A,0.428623,7.873735e-18,0.008117,1.082838e+10,...,-1.031994,3.306580e-33,0.394990,0.000002,0.001949,1.172292e+15,0.015202,0.000011,0.010270,0.000013
3,126036.0,126869.77,False,688,2175,A,0.342863,1.535563e-15,0.005302,1.080689e+10,...,-28.197117,3.119434e-31,-0.222059,0.000003,0.002796,1.073645e+15,0.015772,0.000012,0.011518,0.000011
4,112081.0,126835.09,False,687,2175,A,0.401636,1.946797e-16,0.006881,1.078341e+10,...,3.266904,3.098272e-31,0.116824,0.000003,0.003101,6.878592e+14,0.015292,0.000011,0.016439,0.000006
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
649061,305417.0,163643.22,False,5,2923,ZWS,0.460507,5.501340e-16,0.004526,8.417522e+09,...,0.435954,1.033830e-30,22.548304,0.000004,0.311467,5.357205e+17,0.056926,0.000076,0.311943,0.000895
649062,208657.0,164804.55,False,4,2923,ZWS,0.444376,5.657263e-16,0.005613,8.520107e+09,...,-530.228929,1.004270e-30,3.119645,0.000005,0.267957,5.880671e+17,0.056129,0.000077,0.199230,0.001051
649063,1369475.0,177527.24,False,3,2923,ZWS,0.346221,6.006133e-16,0.003566,8.518577e+09,...,-20.497926,9.582942e-31,0.033529,0.000005,0.256454,5.433244e+17,0.056779,0.000079,0.167165,0.001152
649064,680336.0,183571.70,False,2,2923,ZWS,0.395942,-4.496695e-18,0.013285,1.363991e+10,...,-1.210408,9.625008e-31,11.603673,0.000019,0.997457,4.358847e+18,0.057209,0.000080,-0.229030,0.003798


In the cell below we are iterating over the three lists of parameters:
- horizons - how many hours before the transition are considered to be close enough to be prediction phase
- sizes - share of the positive observations in the whole modelling dataset - this parameter is important because in the original dataset share of positives for some of the horizons was to small, so we dicided to use decrease size of the negative dataset and randomize it
- states - in order to avoid lucky random choices in the sizes randomization we are use a list of different random states to average the results

In [14]:
# Read dataset
data_logdyn = pd.read_parquet('Data/final_dataset.parquet')

# Choose binary target and other parameters
target = 'Flag'
horizons = list(range(2, 9))
shares = np.linspace(0.05, 0.2, 4)
states = list(range(0, 10000, 500))
res = pd.DataFrame(columns = ['Horizon', '1 Share', '1 Share real', 'State',
                              'Train size', 'Test size', 'Train AUC', 'Test AUC',
                              'Train KS-test p-value', 'Test KS-test p-value',
                              'Train F1-score', 'Test F1-score', 
                              'Train precision', 'Test precision', 
                              'Train recall', 'Test recall', 'Variables'])
for horizon in tqdm(horizons):
    data_testing = data_logdyn.copy()
    data_testing['Flag'] = data_testing['Distance'].apply(lambda x: 0 if x >= horizon else 1)
    data_testing.drop(columns = ['Volume', 'MA100', 'Rise', 'Distance', 'Index', 'Ticker'], inplace = True)
    
    data_testing_1 = data_testing[data_testing[target] == 1]
    data_testing_0 = data_testing[data_testing[target] == 0]
    Y_1 = data_testing_1[target]
    X_1 = data_testing_1.drop(columns = [target])
    share_1_orig = len(data_testing_1) / (len(data_testing_0) + len(data_testing_1))
    for share in shares:
        for state in states:
            _, X_0, _, Y_0 = modsel.train_test_split(data_testing_0.drop(columns = [target]), data_testing_0[target], 
                                                     test_size = min(share_1_orig * (1 - share) / share, 1), random_state = state)
            share_1 = len(Y_1) / (len(Y_0) + len(Y_1))
            Y = pd.concat([Y_0, Y_1])
            X = sm.add_constant(pd.concat([X_0, X_1]))
            X_train, X_test, Y_train, Y_test = modsel.train_test_split(X, Y, test_size = 0.2, random_state = state)
            results_rs, auc_train_rs, auc_test_rs, ks_train_rs, ks_test_rs, f1_train_rs,\
                f1_test_rs, pr_train_rs, pr_test_rs, rec_train_rs, rec_test_rs\
                = model_optimization(Y_train, Y_test, X_train, X_test, silent = True)
            res.loc[len(res)] = [horizon, share, share_1, state, len(Y_train), len(Y_test),
                                 auc_train_rs, auc_test_rs, ks_train_rs, ks_test_rs,
                                 f1_train_rs, f1_test_rs, pr_train_rs, pr_test_rs,
                                 rec_train_rs, rec_test_rs, list(results_rs.params.index)]

res_counts = res['Variables'].to_frame()
for col in list(X_1.columns) + ['const']:
    res_counts[col] = res_counts['Variables'].apply(lambda x: 1 if col in x else 0)
res = res.drop(columns = ['Variables']).join(res_counts.drop(columns = ['Variables']))
res.to_parquet('Data/params.parquet')

groups = ['Horizon', '1 Share', '1 Share real']
drops = ['State']
res_means = res.groupby(groups)[res.columns.drop(groups + drops)].mean()
res_means.to_parquet('Data/params_mean.parquet')
res_means

100%|██████████| 7/7 [20:37<00:00, 176.82s/it]


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Train size,Test size,Train AUC,Test AUC,Train KS-test p-value,Test KS-test p-value,Train F1-score,Test F1-score,Train precision,Test precision,...,Correlation Dimension_8_Variance,Lyapunov_8_dyn,Lyapunov_8_Variance,Variance_8_dyn,Variance_8_Variance,PSD_8_dyn,PSD_8_Variance,ACF_1_8_dyn,ACF_1_8_Variance,const
Horizon,1 Share,1 Share real,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2,0.05,0.050069,13996.0,3500.0,0.85205,0.8469,0.0,0.0,0.42765,0.41515,0.3082,0.2953,...,0.15,0.1,0.05,1.0,0.0,0.0,0.7,1.0,0.4,1.0
2,0.1,0.100126,6999.0,1750.0,0.8532,0.8476,0.0,0.0,0.5653,0.5753,0.4667,0.4888,...,0.2,0.0,0.05,1.0,0.05,0.05,0.35,1.0,0.25,1.0
2,0.15,0.15018,4666.0,1167.0,0.8536,0.8496,0.0,0.0,0.6462,0.6538,0.59205,0.60755,...,0.05,0.05,0.05,1.0,0.0,0.0,0.45,1.0,0.15,1.0
2,0.2,0.200229,3500.0,875.0,0.85245,0.85115,0.0,0.0,0.69015,0.69375,0.6728,0.6786,...,0.1,0.0,0.05,1.0,0.0,0.0,0.4,1.0,0.4,1.0
3,0.05,0.050137,27955.0,6989.0,0.8074,0.80875,0.0,0.0,0.36275,0.37215,0.2548,0.2644,...,0.3,0.0,0.0,1.0,0.0,0.0,0.6,0.9,0.4,1.0
3,0.1,0.100258,13980.0,3495.0,0.8082,0.8061,0.0,0.0,0.5058,0.5191,0.42195,0.44415,...,0.05,0.0,0.0,1.0,0.0,0.0,0.6,0.95,0.45,1.0
3,0.15,0.15036,9321.0,2331.0,0.8086,0.80645,0.0,0.0,0.5852,0.58465,0.54925,0.54525,...,0.0,0.05,0.0,1.0,0.1,0.0,0.35,0.95,0.35,1.0
3,0.2,0.200458,6992.0,1748.0,0.80775,0.8095,0.0,0.0,0.62845,0.63405,0.63075,0.6355,...,0.05,0.05,0.0,1.0,0.0,0.0,0.7,1.0,0.5,1.0
4,0.05,0.050206,41875.0,10469.0,0.7623,0.76395,0.0,0.0,0.2965,0.30605,0.20275,0.21275,...,0.3,0.0,0.0,1.0,0.0,0.0,0.75,0.85,0.4,1.0
4,0.1,0.10039,20942.0,5236.0,0.76245,0.7633,0.0,0.0,0.42685,0.4351,0.3455,0.3576,...,0.15,0.15,0.0,1.0,0.0,0.0,0.6,1.0,0.65,1.0


In [15]:
round(res_means[np.in1d(res_means.index.get_level_values(0), list(range(4,9)))].mean(), 2)
# res_means[res_means].mean()

Train size                          36242.15
Test size                            9061.00
Train AUC                               0.71
Test AUC                                0.71
Train KS-test p-value                   0.00
Test KS-test p-value                    0.00
Train F1-score                          0.37
Test F1-score                           0.37
Train precision                         0.33
Test precision                          0.33
Train recall                            0.47
Test recall                             0.48
Hurst                                   0.00
Correlation Dimension                   0.96
Lyapunov                                0.08
Variance                                0.15
PSD                                     0.28
ACF_1                                   1.00
Hurst_8_dyn                             0.01
Hurst_8_Variance                        0.13
Correlation Dimension_8_dyn             0.12
Correlation Dimension_8_Variance        0.23
Lyapunov_8

In [None]:
# Vizual check of the single model
data_testing = data_logdyn.copy()
data_testing['Flag'] = data_testing['Distance'].apply(lambda x: 0 if x >= 8 else 1)
data_testing.drop(columns = ['Volume', 'MA100', 'Rise', 'Distance', 'Index', 'Ticker'], inplace = True)

data_testing_1 = data_testing[data_testing[target] == 1]
data_testing_0 = data_testing[data_testing[target] == 0]
Y_1 = data_testing_1[target]
X_1 = data_testing_1.drop(columns = [target])
_, X_0, _, Y_0 = modsel.train_test_split(data_testing_0.drop(columns = [target]), data_testing_0[target], 
                                                     test_size = min(share_1_orig * (1 - share) / share, 1), random_state = state)
share_1 = len(Y_1) / (len(Y_0) + len(Y_1))
Y = pd.concat([Y_0, Y_1])
X = sm.add_constant(pd.concat([X_0, X_1]))
X_train, X_test, Y_train, Y_test = modsel.train_test_split(X, Y, test_size = 0.2, random_state = state)
results_rs, auc_train_rs, auc_test_rs, ks_train_rs, ks_test_rs, f1_train_rs, f1_test_rs = model_optimization(Y_train, Y_test, X_train, X_test, silent = True)

Y_test_pred = results_rs.predict(X_test)
ks_samples = pd.DataFrame({'Y': Y_test, 'Y_pred': Y_test_pred})
ks_samples_posi = ks_samples[ks_samples['Y'] == 1]['Y_pred']
ks_samples_nega = ks_samples[ks_samples['Y'] == 0]['Y_pred']
fig = go.Figure()
fig.add_trace(go.Histogram(x = ks_samples_posi, name = 'Posi'))
fig.add_trace(go.Histogram(x = ks_samples_nega, name = 'Nega'))
fig.update_layout(barmode = 'overlay')
fig.update_traces(opacity = 0.75)
fig.show()