In [1]:
import nolds
import numpy as np
import scipy as sp
import pandas as pd
import yfinance as yf
from tqdm import tqdm
import plotly.io as pio
import statsmodels.api as sm
import plotly.graph_objects as go

import warnings
warnings.filterwarnings("ignore")
pio.templates.default = "plotly_dark"

window = str(100)
window_metrics = 300
distance = 1000
tail = 50

### Data Import

In [76]:
tickers = pd.read_csv('Data/IWV_holdings.csv', skiprows = 9)
tickers['Len'] = tickers['Ticker'].apply(lambda x: len(x))
tickers = tickers[(tickers['Len'] < 5) & ~(tickers['Ticker'].isin(['-', '\xa0'])) & ~(tickers['Ticker'].isna())]['Ticker']
tickers

0       MSFT
1       AAPL
2       NVDA
3       AMZN
4       META
        ... 
2693    CMAX
2694     EVA
2696    GTXI
2697    CANO
2702    ESH4
Name: Ticker, Length: 2666, dtype: object

In [77]:
history = yf.download(tickers = list(tickers.values), period = "2y", interval = "1h", group_by = 'ticker', auto_adjust = True, progress = True, threads = True)
history.to_parquet('C:/Users/user/Downloads/imported_1h.parquet')

[*********************100%***********************]  2665 of 2665 completed

15 Failed downloads:
- HEIA: No data found, symbol may be delisted
- LGFA: No data found, symbol may be delisted
- BFB: No data found, symbol may be delisted
- ADRO: No data found, symbol may be delisted
- BRKB: No data found, symbol may be delisted
- FCFS: No data found for this date range, symbol may be delisted
- LGFB: No data found, symbol may be delisted
- ESH4: No data found, symbol may be delisted
- GEFB: No data found, symbol may be delisted
- MOGA: No data found, symbol may be delisted
- LENB: No data found, symbol may be delisted
- JBT: No data found for this date range, symbol may be delisted
- GTXI: No data found, symbol may be delisted
- ARD: No data found for this date range, symbol may be delisted
- BFA: No data found, symbol may be delisted


### Choice parameters selection

In [110]:
test = pd.read_csv('C:/Users/user/OneDrive/P&D/Диплом Б/Диплом/TSLA.csv')['Volume'].to_frame()
test['MA' + window] = test['Volume'].rolling(int(window)).mean()
test['Dynamics'] = test['MA' + window] / test['MA' + window].shift(10)
test['Rise'] = (test['Dynamics'] > 1.2)
test

Unnamed: 0,Volume,MA200,Dynamics,Rise
0,93831500,,,False
1,85935500,,,False
2,41094000,,,False
3,25699000,,,False
4,34334500,,,False
...,...,...,...,...
3038,21357800,27910389.5,1.016810,False
3039,22273600,27925778.5,1.014163,False
3040,29370000,27988935.5,1.014379,False
3041,28241000,28059139.0,1.017271,False


In [111]:
fig = go.Figure()
fig.add_trace(go.Scatter(y = test['Volume'], mode = 'lines', name = 'Volume'))
fig.add_trace(go.Scatter(y = test['MA' + window], mode = 'lines', name = 'MA' + window))
fig.add_vline(x = test[test['Rise']].head(1).index[0])
fig.show()

### Data Selection

#### Calculation of the dynamics and definition of the transition with MA

In [3]:
data = pd.read_parquet('C:/Users/user/Downloads/imported_1h.parquet')
data.dropna(axis = 1, how = 'all', inplace = True)
data.replace(0, np.nan, inplace = True)

active_tickers = []
for i in range(len(data.columns)):
    active_tickers.append(data.columns[i][0])
active_tickers = set(active_tickers)

data_final = pd.DataFrame(data.index).set_index(0)
data_final.index.name = None
for ticker in tqdm(active_tickers):
    data_ticker = data[(ticker, 'Volume')].to_frame().dropna()
    data_ticker[(ticker, 'MA' + window)] = data_ticker[(ticker, 'Volume')].rolling(int(window)).mean()
    data_ticker[(ticker, 'Dynamics')] = data_ticker[(ticker, 'MA' + window)] / data_ticker[(ticker, 'MA' + window)].shift(5)
    data_ticker[(ticker, 'Rise')] = data_ticker[(ticker, 'Dynamics')] > 1.2
    data_final = data_final.join(data_ticker)
data_final

  0%|          | 0/2650 [00:00<?, ?it/s]

Unnamed: 0,"(CDMO, Volume)","(CDMO, MA100)","(CDMO, Dynamics)","(CDMO, Rise)","(SWN, Volume)","(SWN, MA100)","(SWN, Dynamics)","(SWN, Rise)","(TKR, Volume)","(TKR, MA100)",...,"(HSTM, Dynamics)","(HSTM, Rise)","(FOX, Volume)","(FOX, MA100)","(FOX, Dynamics)","(FOX, Rise)","(CTRE, Volume)","(CTRE, MA100)","(CTRE, Dynamics)","(CTRE, Rise)"
2022-02-07 09:30:00-05:00,51992.0,,,False,4363256.0,,,False,,,...,,,,,,,48918.0,,,False
2022-02-07 10:30:00-05:00,43041.0,,,False,2605308.0,,,False,75329.0,,...,,False,57734.0,,,False,44098.0,,,False
2022-02-07 11:30:00-05:00,35877.0,,,False,2113814.0,,,False,94150.0,,...,,False,44723.0,,,False,58420.0,,,False
2022-02-07 12:30:00-05:00,28719.0,,,False,1271200.0,,,False,62655.0,,...,,False,105235.0,,,False,103445.0,,,False
2022-02-07 13:30:00-05:00,42017.0,,,False,1655240.0,,,False,58154.0,,...,,False,97907.0,,,False,76958.0,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-02-05 11:30:00-05:00,72609.0,111909.16,1.010306,False,947913.0,2531894.95,0.953514,False,197493.0,63504.07,...,0.988673,False,76461.0,101580.01,1.007111,False,491343.0,110126.97,1.036315,False
2024-02-05 12:30:00-05:00,58516.0,111540.76,0.998611,False,1988483.0,2523011.45,0.958046,False,167471.0,64831.52,...,0.982685,False,85002.0,101274.59,1.002471,False,78959.0,109910.53,1.027962,False
2024-02-05 13:30:00-05:00,54540.0,111608.08,0.984815,False,2715476.0,2521006.84,0.962974,False,130160.0,66003.14,...,0.965894,False,83788.0,101524.12,0.992078,False,129814.0,110377.29,1.025555,False
2024-02-05 14:30:00-05:00,80289.0,111877.02,0.983937,False,1904165.0,2506791.53,0.965364,False,163500.0,67158.64,...,0.965928,False,100562.0,101943.35,0.991226,False,194437.0,111508.98,1.048421,False


In [21]:
pos = 0
ticks = []

active_tickers = []
for i in range(len(data_final.columns)):
    active_tickers.append(data_final.columns[i][0])
active_tickers = set(active_tickers)

data_selected = pd.DataFrame(data_final.index).set_index(0)
data_selected.index.name = None
for ticker in tqdm(active_tickers):
    num_ma = len(data_final[(ticker, 'MA' + window)].dropna())
    if num_ma > 3000:
        if  num_ma / 300 > len(data_final[data_final[(ticker, 'Rise')] == True]) > 0:
            pos += 1
            data_selected = data_selected.join(data_final[[(ticker, 'Volume'), (ticker, 'MA' + window), (ticker, 'Rise')]])
            ticks.append(ticker)
print(pos)
str_cols = []
for col in data_selected.columns:
    str_cols.append(str(col).replace('(', '').replace(')', '').replace("'", ''))
data_selected.columns = str_cols
data_selected.to_parquet('Data/filtered_1h.parquet')
data_selected

  0%|          | 0/2650 [00:00<?, ?it/s]

100%|██████████| 2650/2650 [00:29<00:00, 89.20it/s] 


889


Unnamed: 0,"SWN, Volume","SWN, MA100","SWN, Rise","TKR, Volume","TKR, MA100","TKR, Rise","AXP, Volume","AXP, MA100","AXP, Rise","NPK, Volume",...,"LAZ, Rise","HSTM, Volume","HSTM, MA100","HSTM, Rise","CWK, Volume","CWK, MA100","CWK, Rise","CTRE, Volume","CTRE, MA100","CTRE, Rise"
2022-02-07 09:30:00-05:00,4363256.0,,False,,,,554587.0,,False,,...,False,,,,89387.0,,False,48918.0,,False
2022-02-07 10:30:00-05:00,2605308.0,,False,75329.0,,False,316824.0,,False,3180.0,...,False,3149.0,,False,42540.0,,False,44098.0,,False
2022-02-07 11:30:00-05:00,2113814.0,,False,94150.0,,False,219884.0,,False,1392.0,...,False,5362.0,,False,43516.0,,False,58420.0,,False
2022-02-07 12:30:00-05:00,1271200.0,,False,62655.0,,False,271801.0,,False,2161.0,...,False,3166.0,,False,59953.0,,False,103445.0,,False
2022-02-07 13:30:00-05:00,1655240.0,,False,58154.0,,False,219220.0,,False,3136.0,...,False,11736.0,,False,27337.0,,False,76958.0,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-02-05 11:30:00-05:00,947913.0,2531894.95,False,197493.0,63504.07,False,304848.0,476615.34,False,1232.0,...,False,2103.0,5925.01,False,131528.0,220740.85,False,491343.0,110126.97,False
2024-02-05 12:30:00-05:00,1988483.0,2523011.45,False,167471.0,64831.52,False,219253.0,475801.77,False,1622.0,...,False,3798.0,5874.99,False,91418.0,220338.91,False,78959.0,109910.53,False
2024-02-05 13:30:00-05:00,2715476.0,2521006.84,False,130160.0,66003.14,False,203205.0,473737.69,False,2128.0,...,False,4124.0,5851.51,False,116268.0,220293.30,False,129814.0,110377.29,False
2024-02-05 14:30:00-05:00,1904165.0,2506791.53,False,163500.0,67158.64,False,380821.0,474993.09,False,1511.0,...,False,3950.0,5864.47,False,158176.0,220469.94,False,194437.0,111508.98,False


#### Calculation of the metrics

In [3]:
data_selected = pd.read_parquet('Data/filtered_1h.parquet')

active_tickers = []
for i in data_selected.columns:
    if i.__contains__('Rise'):
        active_tickers.append(i.replace(', Rise', ''))
active_tickers = set(active_tickers)

ds = pd.DataFrame()
for ticker in tqdm(active_tickers):
    data_ticker = data_selected[[ticker + ', Volume', ticker + ', MA' + window, ticker + ', Rise']].dropna().reset_index(drop = True)
    data_ticker.rename(columns = {ticker + ', Volume': 'Volume', ticker + ', MA' + window: 'MA' + window, ticker + ', Rise': 'Rise'}, inplace = True)
    rises_ticker = data_ticker[data_ticker['Rise'] == True].index
    max_index = data_ticker.index.max()
    ind_0 = 0
    for ind in rises_ticker:
        if (ind >= ind_0 + distance) & (ind <= max_index - tail):
            ds_ticker_ind = data_ticker.iloc[ind - distance: ind + tail]
            ds_ticker_ind['Distance'] = - ds_ticker_ind.index + ind
            ds_ticker_ind['Index'] = ind
            ds_ticker_ind['Ticker'] = ticker
            ds_ticker_ind.reset_index(drop = True, inplace = True)

            Hurst = []
            corr_dim = []
            l_exp = []
            var = []
            PSD = []
            acf_1 = []
            l_exp_var = []

            for j in range(len(ds_ticker_ind)):
                if j <= window_metrics:
                    Hurst.append(None)
                    corr_dim.append(None)
                    l_exp.append(None)
                    var.append(None)
                    PSD.append(None)
                    acf_1.append(None)
                else:
                    data_before_j = ds_ticker_ind.iloc[j - window_metrics : j]

                    Hurst_j = nolds.hurst_rs(data_before_j['Volume'])
                    Hurst.append(Hurst_j)

                    corr_dim_j = nolds.corr_dim(data_before_j['Volume'], 10)
                    corr_dim.append(corr_dim_j)

                    l_exp_j = nolds.lyap_r(data_before_j['Volume'])
                    l_exp.append(l_exp_j)

                    var_j = data_before_j['Volume'].var()
                    var.append(var_j)

                    freq, psd = sp.signal.welch(data_before_j['Volume'])
                    PSD_j = np.polyfit(np.log(freq)[1:j], np.log(psd)[1:j], 1)[0]
                    PSD.append(PSD_j)

                    acf_i = sm.tsa.acf(data_before_j['Volume'], nlags = 1)
                    acf_1.append(acf_i[1])

            ds_ticker_ind['Hurst'] = Hurst
            ds_ticker_ind['Correlation Dimension'] = corr_dim
            ds_ticker_ind['Lyapunov'] = l_exp
            ds_ticker_ind['Variance'] = var
            ds_ticker_ind['PSD'] = PSD
            ds_ticker_ind['ACF_1'] = acf_1

            # for j in range(len(ds_ticker_ind)):
            #     if j <= window_metrics + int(window):
            #         l_exp_var.append(None)
            #     else:
            #         data_before_j = ds_ticker_ind.iloc[j - int(window) : j]

            #         l_exp_var_j = data_before_j['Lyapunov'].var()
            #         l_exp_var.append(l_exp_var_j)

            # ds_ticker_ind['Lyapunov Variance'] = l_exp_var
            ds_ticker_ind.dropna(inplace = True)

            ds = pd.concat([ds, ds_ticker_ind])
            ind_0 = ind
ds.reset_index(drop = True, inplace = True)
ds.to_parquet('Data/dataset.parquet')
ds

100%|██████████| 889/889 [11:56:57<00:00, 48.39s/it]   


Unnamed: 0,Volume,MA100,Rise,Distance,Index,Ticker,Hurst,Correlation Dimension,Lyapunov,Variance,PSD,ACF_1
0,121268.0,251210.63,False,1000,3156,GPN,,,,,,
1,339348.0,250504.94,False,999,3156,GPN,,,,,,
2,1444842.0,258651.90,False,998,3156,GPN,,,,,,
3,253160.0,250964.96,False,997,3156,GPN,,,,,,
4,242106.0,244523.56,False,996,3156,GPN,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
919795,66213.0,71594.99,False,-45,2069,GRBK,0.606235,0.022539,0.014606,2.424159e+09,-0.522894,0.568950
919796,69839.0,72107.72,False,-46,2069,GRBK,0.608672,0.019699,0.017425,2.422550e+09,-0.520908,0.570781
919797,108197.0,72999.96,False,-47,2069,GRBK,0.607950,0.019699,0.015398,2.423532e+09,-0.519576,0.571200
919798,58318.0,73345.48,False,-48,2069,GRBK,0.580916,0.020781,0.017768,2.435413e+09,-0.515678,0.570239
