In [1]:
import operator
import numpy as np
import scipy as sp
import pandas as pd
from tqdm import tqdm
import plotly.io as pio
import statsmodels.api as sm
import plotly.graph_objects as go
import sklearn.metrics as metrics
import sklearn.model_selection as modsel

import warnings
warnings.filterwarnings("ignore")
pio.templates.default = "plotly_dark"

In [2]:
def roc_metric(Y, Y_pred):
    """
    Function for the calculation of AUC metric

    Inputs:
    ----------
    Y : DataFrame
        Set of Y for the model
    Y_pred : DataFrame
        Set of predicted Y for the model

    Returns:
    ----------
    
    """

    fpr, tpr, thresholds = metrics.roc_curve(Y, Y_pred, pos_label=1)
    auc = round(metrics.auc(fpr, tpr), 3)
    optimal_index = np.argmax(tpr - fpr)

    return auc, thresholds[optimal_index]

#---------------------------------------------------------------------------------------

def remove_most_insignificant(X, results):
    """
    Function for the removal of the most insignificant variables from the model

    Inputs:
    ----------
    X : DataFrame
        Set of X for the model
    results : model
        Fitted statsmodels model

    Returns:
    ----------
    X_val : DataFrame
        Optimized set of X for the validation of the model
    X_test : DataFrame
        Optimized set of X for the testing of the model
    """
    
    # Use operator to find the key which belongs to the maximum value in the dictionary
    max_p_value = max(results.pvalues.iteritems(), key = operator.itemgetter(1))[0]
    # Drop the worst feature
    X.drop(columns = max_p_value, inplace = True)

    return X

#---------------------------------------------------------------------------------------

def model_optimization(Y, 
                       X, 
                       type:str = 'Probit', 
                       p_value_bord:float = 0.05, 
                       silent:bool = False):
    """
    Function for the optimization of OLS

    Inputs:
    ----------
    Y : array
        Target variable for the regression
    X : DataFrame
        Set of X for the model
    type : str = 'Probit'
        Type of the model
    p_value_bord : float = 0.05
        Maximum acceptable p-value for the coefficient
    silent : bool = False
        Whether not to show reports about model

    Returns:
    ----------
    results : model
        Fitted statsmodels model
    val_rmse : float
        RMSE score for the validation
    test_rmse : float
        RMSE score for the test
    Y_val_pred : array
        Prediction for the validation
    Y_test_pred : array
        Prediction for the test
    """
    
    insignificant_feature = True
    while insignificant_feature:
        # Create model
        if type == 'Probit':
            model = sm.Probit(Y, X)
        else:
            model = sm.Logit(Y, X)

        # Fit model and get
        results = model.fit(disp = 0)
        significant = [p_value < p_value_bord for p_value in results.pvalues]
        if all(significant):
            insignificant_feature = False
        else:
            # If there's only one insignificant variable left
            if X.shape[1] == 1:
                print('No significant features found')
                results = None
                insignificant_feature = False
            else:
                X = remove_most_insignificant(X, results)
    
    Y_pred = results.predict(X)
    auc, threshold = roc_metric(Y, Y_pred)
    Y_pred_round = np.where(Y_pred < threshold, np.floor(Y_pred), np.ceil(Y_pred))
    ks_samples = pd.DataFrame({'Y': Y, 'Y_pred': Y_pred})
    ks_samples_posi = ks_samples[ks_samples['Y'] == 1]['Y_pred']
    ks_samples_nega = ks_samples[ks_samples['Y'] == 0]['Y_pred']
    ks = sp.stats.kstest(ks_samples_posi, ks_samples_nega)
    f1 = round(metrics.f1_score(Y, Y_pred_round), 2)
    if silent == False:
        print(f'AUC score: {auc}, KS-test p-value: {round(ks.pvalue, 3)}, F1-score: {f1}')
        print(results.summary())

    return results, auc, round(ks.pvalue, 9), f1

In [17]:
# Read dataset and define columns for feature generation
data = pd.read_parquet('Data/dataset.parquet')
indices = data.groupby(['Ticker', 'Index']).size().index.values
cols = ['Hurst', 'Correlation Dimension', 'Lyapunov', 'Variance', 'PSD', 'ACF_1']

# Set lag for dynamics and short variance calculation
lag_model = 10

# Calculate dynamics and short variance
# Original idea about variance was born from the largest Lyapunov exponent's behaviour before the critical transition point:
# is mostly didn't move in nominal values but its variance in some cases decreased signigicantly 
data_logdyn = pd.DataFrame()
for ind in tqdm(indices):
    data_ind = data[(data['Ticker'] == ind[0]) & (data['Index'] == ind[1])]
    for col in cols:
        data_ind[col + '_' + str(lag_model) + '_dyn'] = data_ind[col] / data_ind[col].shift(lag_model) - 1
        data_ind[col + '_' + 'Variance'] = data_ind[col].rolling(lag_model).var()
    data_ind.dropna(inplace = True)
    data_logdyn = pd.concat([data_logdyn, data_ind])

# Reset index to get rid of dates and save final dataset
data_logdyn.reset_index(drop = True, inplace = True)
data_logdyn = data_logdyn[data_logdyn['Distance'] > 0]
data_logdyn.to_parquet('Data/final_dataset.parquet')
data_logdyn

100%|██████████| 876/876 [00:56<00:00, 15.63it/s]


Unnamed: 0,Volume,MA100,Rise,Distance,Index,Ticker,Hurst,Correlation Dimension,Lyapunov,Variance,...,Correlation Dimension_10_dyn,Correlation Dimension_Variance,Lyapunov_10_dyn,Lyapunov_Variance,Variance_10_dyn,Variance_Variance,PSD_10_dyn,PSD_Variance,ACF_1_10_dyn,ACF_1_Variance
0,193237.0,127637.89,False,689,2175,A,0.428623,7.873735e-18,0.008117,1.082838e+10,...,-1.075298,2.894832e-31,0.502499,0.000002,0.001090,1.007442e+15,0.018504,0.000013,0.008858,0.000010
1,126036.0,126869.77,False,688,2175,A,0.342863,1.535563e-15,0.005302,1.080689e+10,...,-0.051938,2.619049e-31,-0.166924,0.000002,-0.002357,9.175725e+14,0.018222,0.000014,0.003414,0.000010
2,112081.0,126835.09,False,687,2175,A,0.401636,1.946797e-16,0.006881,1.078341e+10,...,-1.791054,2.476123e-31,0.182500,0.000002,-0.002212,9.493537e+14,0.017401,0.000015,0.006353,0.000011
3,70429.0,126321.60,False,686,2175,A,0.383883,9.473740e-17,0.009417,1.078750e+10,...,-2.677941,2.432220e-31,0.381853,0.000003,0.000997,9.050450e+14,0.018848,0.000015,0.012764,0.000009
4,118291.0,126230.94,False,685,2175,A,0.385216,1.566568e-16,0.008132,1.080631e+10,...,2.433535,2.416978e-31,0.319977,0.000003,0.005231,5.821026e+14,0.020919,0.000015,0.018105,0.000005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
647309,305417.0,163643.22,False,5,2923,ZWS,0.460507,5.501340e-16,0.004526,8.417522e+09,...,2.362234,8.082636e-31,0.638362,0.000003,0.504284,7.309311e+17,0.072543,0.000114,0.341586,0.002174
647310,208657.0,164804.55,False,4,2923,ZWS,0.444376,5.657263e-16,0.005613,8.520107e+09,...,1.511423,8.063026e-31,1.106991,0.000005,0.464108,6.627906e+17,0.071334,0.000115,0.403682,0.001762
647311,1369475.0,177527.24,False,3,2923,ZWS,0.346221,6.006133e-16,0.003566,8.518577e+09,...,0.567714,8.083593e-31,17.557383,0.000004,0.327211,6.372140e+17,0.071266,0.000117,0.338023,0.001317
647312,680336.0,183571.70,False,2,2923,ZWS,0.395942,-4.496695e-18,0.013285,1.363991e+10,...,3.206594,8.087118e-31,8.750925,0.000016,1.029884,4.028828e+18,0.071288,0.000120,-0.196847,0.003063


In the cell below we are iterating over the three lists of parameters:
- horizons - how many hours before the transition are considered to be close enough to be prediction phase
- sizes - share of the positive observations in the whole modelling dataset - this parameter is important because in the original dataset share of positives for some of the horizons was to small, so we dicided to use decrease size of the negative dataset and randomize it
- states - in order to avoid lucky random choices in the sizes randomization we are use a list of different random states to average the results

In [18]:
# Read dataset
data_logdyn = pd.read_parquet('Data/final_dataset.parquet')

# Choose binary target and other parameters
target = 'Flag'
horizons = list(range(4, 17))
sizes = np.linspace(0.05, 0.1, 3)
states = list(range(0, 10000, 500))
res = pd.DataFrame(columns = ['Horizon', 'Size', '1 Share', 'State', 'AUC', 'KS-test p-value', 'F1-score', 'Variables'])
for horizon in tqdm(horizons):
    data_testing = data_logdyn.copy()
    data_testing['Flag'] = data_testing['Distance'].apply(lambda x: 0 if x >= horizon else 1)
    data_testing.drop(columns = ['Volume', 'MA100', 'Rise', 'Distance', 'Index', 'Ticker'], inplace = True)
    
    data_testing_1 = data_testing[data_testing[target] == 1]
    data_testing_0 = data_testing[data_testing[target] == 0]
    Y_1 = data_testing_1[target]
    X_1 = data_testing_1.drop(columns = [target])
    share_1_orig = len(data_testing_1) / (len(data_testing_0) + len(data_testing_1))
    for size in sizes:
        for state in states:
            _, X_0, _, Y_0 = modsel.train_test_split(data_testing_0.drop(columns = [target]), data_testing_0[target], 
                                                     test_size = min(share_1_orig * (1 - size) / size, 1), random_state = state)
            share_1 = len(Y_1) / (len(Y_0) + len(Y_1))
            Y = pd.concat([Y_0, Y_1])
            X = sm.add_constant(pd.concat([X_0, X_1]))
            results_rs, auc_rs, ks_rs, f1_rs = model_optimization(Y, X, silent = True)
            res.loc[len(res)] = [horizon, size, share_1, state, auc_rs, ks_rs, f1_rs, list(results_rs.params.index)]

res_counts = res['Variables'].to_frame()
for col in list(X_1.columns) + ['const']:
    res_counts[col] = res_counts['Variables'].apply(lambda x: 1 if col in x else 0)
res = res.drop(columns = ['Variables']).join(res_counts.drop(columns = ['Variables']))
res.to_parquet('Data/params.parquet')

groups = ['Horizon', 'Size', '1 Share']
drops = ['State']
res_means = res.groupby(groups)[res.columns.drop(groups + drops)].mean()
res_means.to_parquet('Data/params_mean.parquet')
res_means

100%|██████████| 13/13 [1:33:16<00:00, 430.52s/it]


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,AUC,KS-test p-value,F1-score,Hurst,Correlation Dimension,Lyapunov,Variance,PSD,ACF_1,Hurst_10_dyn,...,Correlation Dimension_Variance,Lyapunov_10_dyn,Lyapunov_Variance,Variance_10_dyn,Variance_Variance,PSD_10_dyn,PSD_Variance,ACF_1_10_dyn,ACF_1_Variance,const
Horizon,Size,1 Share,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
4,0.05,0.050207,0.7508,0.0,0.279,0.0,1.0,0.4,0.0,0.35,1.0,0.0,...,0.1,0.0,0.0,1.0,0.0,0.0,0.95,0.9,0.65,1.0
4,0.075,0.075303,0.75125,0.0,0.352,0.0,1.0,0.35,0.0,0.3,1.0,0.0,...,0.05,0.0,0.0,1.0,0.0,0.0,0.9,0.9,0.85,1.0
4,0.1,0.10039,0.7518,0.0,0.41,0.0,0.95,0.2,0.0,0.55,1.0,0.0,...,0.1,0.0,0.0,1.0,0.0,0.0,0.75,0.9,0.9,1.0
5,0.05,0.050277,0.7163,0.0,0.2415,0.0,1.0,0.5,0.05,0.5,1.0,0.0,...,0.45,0.0,0.0,1.0,0.0,0.0,1.0,0.9,0.7,1.0
5,0.075,0.075403,0.71675,0.0,0.307,0.0,1.0,0.35,0.15,0.4,1.0,0.0,...,0.2,0.0,0.0,1.0,0.0,0.0,0.85,0.9,0.9,1.0
5,0.1,0.100525,0.71745,0.0,0.3575,0.0,1.0,0.15,0.3,0.5,1.0,0.0,...,0.25,0.0,0.0,1.0,0.0,0.0,0.9,0.9,0.95,1.0
6,0.05,0.050347,0.6945,0.0,0.2205,0.0,1.0,0.4,0.25,0.3,1.0,0.0,...,0.65,0.0,0.0,1.0,0.0,0.0,1.0,0.8,0.85,1.0
6,0.075,0.075507,0.69485,0.0,0.281,0.0,1.0,0.25,0.2,0.25,1.0,0.0,...,0.25,0.0,0.0,1.0,0.0,0.0,1.0,0.85,0.95,1.0
6,0.1,0.100657,0.6952,0.0,0.329,0.0,1.0,0.2,0.35,0.35,1.0,0.0,...,0.25,0.0,0.0,1.0,0.0,0.0,0.95,0.9,0.95,1.0
7,0.05,0.050417,0.67875,0.0,0.205,0.0,1.0,0.45,0.3,0.2,1.0,0.0,...,0.7,0.0,0.0,1.0,0.0,0.0,1.0,0.85,1.0,1.0


In [19]:
round(res_means[np.in1d(res_means.index.get_level_values(0), list(range(4,9)))].mean(), 2)
# res_means[res_means].mean()

AUC                               0.70
KS-test p-value                   0.00
F1-score                          0.29
Hurst                             0.00
Correlation Dimension             1.00
Lyapunov                          0.24
Variance                          0.30
PSD                               0.45
ACF_1                             1.00
Hurst_10_dyn                      0.00
Hurst_Variance                    0.00
Correlation Dimension_10_dyn      0.10
Correlation Dimension_Variance    0.35
Lyapunov_10_dyn                   0.00
Lyapunov_Variance                 0.00
Variance_10_dyn                   1.00
Variance_Variance                 0.00
PSD_10_dyn                        0.00
PSD_Variance                      0.95
ACF_1_10_dyn                      0.91
ACF_1_Variance                    0.91
const                             1.00
dtype: float64

In [20]:
# Vizual check of the single model
data_testing = data_logdyn.copy()
data_testing['Flag'] = data_testing['Distance'].apply(lambda x: 0 if x >= 8 else 1)
data_testing.drop(columns = ['Volume', 'MA100', 'Rise', 'Distance', 'Index', 'Ticker'], inplace = True)

data_testing_1 = data_testing[data_testing[target] == 1]
data_testing_0 = data_testing[data_testing[target] == 0]
Y_1 = data_testing_1[target]
X_1 = data_testing_1.drop(columns = [target])
_, X_0, _, Y_0 = modsel.train_test_split(data_testing_0.drop(columns = [target]), data_testing_0[target], 
                                         test_size = 0.05, random_state = 2000)
Y = pd.concat([Y_0, Y_1])
X = sm.add_constant(pd.concat([X_0, X_1]))
results_rs, auc_rs, ks_rs, f1_rs = model_optimization(Y, X, silent = True)
Y_pred = results_rs.predict()
ks_samples = pd.DataFrame({'Y': Y, 'Y_pred': Y_pred})
ks_samples_posi = ks_samples[ks_samples['Y'] == 1]['Y_pred']
ks_samples_nega = ks_samples[ks_samples['Y'] == 0]['Y_pred']
fig = go.Figure()
fig.add_trace(go.Histogram(x = ks_samples_posi, name = 'Posi'))
fig.add_trace(go.Histogram(x = ks_samples_nega, name = 'Nega'))
fig.update_layout(barmode = 'overlay')
fig.update_traces(opacity = 0.75)
fig.show()