In [110]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import os
import statsmodels 
import seaborn as sns
import scipy.stats as ss
import statsmodels.graphics.tsaplots as sgt
import statsmodels.tsa.stattools as sts 
import sklearn
import arch
import datetime as dt


from pmdarima.arima import auto_arima
from statsmodels.tsa.stattools import adfuller
from statsmodels.stats.diagnostic import acorr_ljungbox, het_arch
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from tqdm import tqdm
from time import sleep
# Apply seaborn's default theme so the matplotlib figures produced below share one look.
sns.set()

In [None]:
def read_csv(name:str):
    """Load ``../data/processed/<name>.csv`` indexed by its ``Date`` column.

    The ``Date`` column is parsed to datetimes and becomes the index, the
    frame is conformed to a 12-hour grid, and every row containing a
    missing value (including grid slots absent from the file) is dropped.

    Parameters
    ----------
    name : str
        File stem, without directory or ``.csv`` extension.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with a ``DatetimeIndex`` named ``Date``.
    """
    df = pd.read_csv(f'../data/processed/{name}.csv')
    df['Date'] = pd.to_datetime(df['Date'])
    df = df.set_index('Date')
    # ``asfreq`` returns a NEW frame; the original discarded that result, so
    # the 12-hour frequency was never applied. The lowercase alias is used
    # because the uppercase 'H' alias is deprecated in recent pandas.
    df = df.asfreq('12h')
    # Drop NaN rows, including the placeholder rows ``asfreq`` inserts for
    # timestamps that were missing from the file.
    return df.dropna()

In [None]:
# Load both processed CSVs through read_csv: the main frame and the test frame.
df, df_test = (read_csv(stem) for stem in ('data_processed', 'data_test'))

In [None]:
# Preview the first five rows of the main frame.
df.head(n=5)

Unnamed: 0_level_0,BTCUSDT,ETHUSDT,ADAUSDT,BNBUSDT,Ret_BTCUSDT,Ret_cum_BTCUSDT,Norm_BTCUSDT,Ret_ETHUSDT,Ret_cum_ETHUSDT,Norm_ETHUSDT,Ret_ADAUSDT,Ret_cum_ADAUSDT,Norm_ADAUSDT,Ret_BNBUSDT,Ret_cum_BNBUSDT,Norm_BNBUSDT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2018-06-01 12:00:00,7521.01,579.0,0.22038,14.2888,2.16834,2.16834,102.16834,1.4668,1.4668,101.4668,2.122335,2.122335,102.122335,3.692308,3.692308,103.692308
2018-06-02 00:00:00,7652.28,592.73,0.22428,14.5,1.745377,3.913718,103.951563,2.37133,3.83813,103.872912,1.769671,3.892006,103.929564,1.478081,5.170388,105.224964
2018-06-02 12:00:00,7640.03,590.85,0.22648,14.6732,-0.160083,3.753635,103.785155,-0.317176,3.520953,103.543452,0.980917,4.872923,104.949027,1.194483,6.364871,106.481858
2018-06-03 00:00:00,7714.85,619.93,0.23284,14.7861,0.979316,4.73295,104.801539,4.921723,8.442676,108.639574,2.808195,7.681118,107.8962,0.76943,7.134301,107.301161
2018-06-03 12:00:00,7714.26,619.66,0.22659,14.6995,-0.007648,4.725303,104.793524,-0.043553,8.399123,108.592258,-2.684247,4.996871,105.0,-0.585685,6.548616,106.672714


In [38]:
def fit_auto(df, target, exogenous=None, exog=True, **kwargs):
    """Run ``auto_arima`` on the ``Ret_<target>`` column of ``df`` and print the summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a ``Ret_<target>`` column and, when ``exog`` is true,
        every column listed in ``exogenous``.
    target : str
        Suffix of the dependent column; the fitted series is
        ``df[['Ret_<target>']]``.
    exogenous : list of str, optional
        Columns of ``df`` passed to ``auto_arima`` as ``X``. Required when
        ``exog`` is true.
    exog : bool, default True
        Whether to include the exogenous regressors.
    **kwargs
        Extra keyword arguments forwarded to ``auto_arima``. They override
        the search defaults set below (previously they were silently ignored).

    Returns
    -------
    The fitted model object returned by ``auto_arima``.

    Raises
    ------
    ValueError
        If ``exog`` is true but no ``exogenous`` columns were given.
    """
    # Fail early with a clear message instead of the opaque ``KeyError: None``
    # that ``df[None]`` would raise.
    if exog and exogenous is None:
        raise ValueError(
            "exogenous columns are required when exog=True; "
            "pass exogenous=[...] or set exog=False"
        )

    # Search configuration shared by both the exogenous and the plain fit.
    search_params = dict(
        max_order=None, max_p=9, max_q=9, max_d=2,
        max_P=6, max_Q=6, max_D=4,
        maxiter=70, trend='ct',
    )
    search_params.update(kwargs)
    if exog:
        search_params['X'] = df[exogenous]

    model = auto_arima(df[[f'Ret_{target}']], **search_params)
    print(model.summary())
    return model




In [169]:


def analyze_residuals(model, name:str):
    """Print residual diagnostics for a fitted model and save its diagnostic figures.

    Prints summary statistics of the residuals and the p-values of the
    Ljung-Box (lag 1), Jarque-Bera, ARCH heteroskedasticity and ADF tests.
    It then draws and saves the residual ACF and PACF (30 lags) and the
    model's own diagnostics panel under ``../reports/figures/``.

    Parameters
    ----------
    model
        Fitted model exposing ``resid()`` and ``plot_diagnostics()``.
    name : str
        Label used in the plot titles and in the saved file names
        (``<name>_resid_acf.png``, ``<name>_resid_pacf.png``,
        ``<name>_diagnostics.png``).

    Returns
    -------
    None
    """
    # ``jarque_bera`` is not among the notebook's top-level imports, so the
    # original call raised NameError. The statsmodels variant is the one that
    # returns the four values (statistic, p-value, skew, kurtosis) unpacked below.
    from statsmodels.stats.stattools import jarque_bera

    # Fetch the residuals once and reuse them for every test and plot.
    residuals = model.resid()
    residuals_df = pd.DataFrame(residuals)

    # Summary statistics
    print(residuals_df.describe())

    # Ljung-Box test at lag 1; read the p-value by column name, not position.
    lb_test = acorr_ljungbox(residuals, lags=[1], return_df=True)
    lb_p_value = lb_test['lb_pvalue'].iloc[0]

    # Jarque-Bera normality test (only the p-value is reported).
    _, jb_p_value, _, _ = jarque_bera(residuals)

    # ARCH heteroskedasticity test; element 1 is the LM-test p-value.
    het_p_value = het_arch(residuals)[1]

    # ADF unit-root test; element 1 is the p-value.
    adf_p_value = round(adfuller(residuals)[1], 3)

    # Print summary of analysis
    print("Residual Analysis:")
    print("------------------"*2)
    print(f"\nLjung-Box (lag 1) p-value: {lb_p_value}")
    print(f"\nJarque-Bera p-value: {jb_p_value}")
    print(f"\nHeteroskedasticity p-value: {het_p_value}")
    print(f"\nADF p-value: {adf_p_value}")

    # ACF and PACF share identical plotting code; only the plotting function,
    # the label and the file suffix differ.
    correlograms = (
        (plot_acf, 'Autocorrelation', 'acf'),
        (plot_pacf, 'Partial Autocorrelation', 'pacf'),
    )
    for plot_fn, label, suffix in correlograms:
        fig, ax = plt.subplots(figsize=(7, 4))
        plot_fn(residuals_df, lags=30, ax=ax)
        ax.set_title(f'{label} of Residuals_{name}')
        ax.set_xlabel('Lag')
        ax.set_ylabel(label)
        fig.savefig(f'../reports/figures/{name}_resid_{suffix}.png')
        plt.show()

    print('\n')
    # plot_diagnostics draws onto a new current figure, which plt.savefig then saves.
    model.plot_diagnostics(figsize=(20, 11))
    plt.savefig(f'../reports/figures/{name}_diagnostics.png')
    plt.show()



In [69]:
# Auto-ARIMA on the Ret_BNBUSDT column, without exogenous regressors.
model_arima_bnb = fit_auto(df, target='BNBUSDT', exog=False)

                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:                 2592
Model:               SARIMAX(0, 0, 2)   Log Likelihood               -7090.591
Date:                Tue, 27 Jun 2023   AIC                          14191.182
Time:                        21:11:54   BIC                          14220.483
Sample:                    06-01-2018   HQIC                         14201.800
                         - 12-18-2021                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
intercept     -0.0163      0.170     -0.096      0.924      -0.350       0.317
drift          0.0002      0.000      1.704      0.088   -2.63e-05       0.000
ma.L1          0.1337      0.010     13.288      0.0

In [None]:
# TODO(review): this cell held only the bare name `analysis`, which raises
# NameError on Run All. Presumably a residual analysis was intended here,
# e.g. analyze_residuals(model_arima_bnb, 'arima_bnb') -- confirm and fill in.

In [70]:
# Auto-ARIMA on the Ret_BNBUSDT column with the other three return columns as regressors.
model_arima_bnb_exog = fit_auto(
    df,
    target='BNBUSDT',
    exogenous=['Ret_BTCUSDT', 'Ret_ADAUSDT', 'Ret_ETHUSDT'],
)

                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:                 2592
Model:               SARIMAX(5, 0, 0)   Log Likelihood               -6094.465
Date:                Tue, 27 Jun 2023   AIC                          12210.929
Time:                        21:13:23   BIC                          12275.391
Sample:                    06-01-2018   HQIC                         12234.289
                         - 12-18-2021                                         
Covariance Type:                  opg                                         
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
intercept       0.0631      0.108      0.585      0.558      -0.148       0.274
drift          1.6e-05   7.05e-05      0.227      0.820      -0.000       0.000
Ret_BTCUSDT     0.3600      0.026     13.739    

In [71]:
# Auto-ARIMA on the Ret_BTCUSDT column, without exogenous regressors.
model_arima_btc = fit_auto(df, target='BTCUSDT', exog=False)


                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:                 2592
Model:               SARIMAX(0, 0, 2)   Log Likelihood               -6205.835
Date:                Tue, 27 Jun 2023   AIC                          12421.670
Time:                        21:13:38   BIC                          12450.971
Sample:                    06-01-2018   HQIC                         12432.288
                         - 12-18-2021                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
intercept      0.0041      0.112      0.037      0.971      -0.215       0.224
drift       7.923e-05   6.94e-05      1.141      0.254   -5.69e-05       0.000
ma.L1          0.0512      0.011      4.688      0.0

In [72]:
# Auto-ARIMA on the Ret_BTCUSDT column with the other three return columns as regressors.
model_arima_btc_exog = fit_auto(
    df,
    target='BTCUSDT',
    exogenous=['Ret_BNBUSDT', 'Ret_ETHUSDT', 'Ret_ADAUSDT'],
)


                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:                 2592
Model:               SARIMAX(0, 0, 1)   Log Likelihood               -4743.635
Date:                Tue, 27 Jun 2023   AIC                           9501.270
Time:                        21:14:17   BIC                           9542.291
Sample:                    06-01-2018   HQIC                          9516.135
                         - 12-18-2021                                         
Covariance Type:                  opg                                         
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
intercept       0.0886      0.074      1.195      0.232      -0.057       0.234
drift       -6.454e-05   4.58e-05     -1.408      0.159      -0.000    2.53e-05
Ret_BNBUSDT     0.1228      0.008     15.339    

In [74]:


# Auto-ARIMA on the Ret_ADAUSDT column, without exogenous regressors.
model_arima_ada = fit_auto(df, target='ADAUSDT', exog=False)


                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:                 2592
Model:               SARIMAX(0, 1, 5)   Log Likelihood               -7322.288
Date:                Tue, 27 Jun 2023   AIC                          14660.575
Time:                        21:18:12   BIC                          14707.454
Sample:                    06-01-2018   HQIC                         14677.563
                         - 12-18-2021                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
intercept      0.0098      0.004      2.219      0.027       0.001       0.019
drift      -6.839e-06   2.88e-06     -2.378      0.017   -1.25e-05    -1.2e-06
ma.L1         -0.8832      0.016    -56.904      0.0

In [75]:
# Auto-ARIMA on the Ret_ADAUSDT column with the other three return columns as regressors.
model_arima_ada_exog = fit_auto(
    df,
    target='ADAUSDT',
    exogenous=['Ret_BTCUSDT', 'Ret_ETHUSDT', 'Ret_BNBUSDT'],
)


                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:                 2592
Model:               SARIMAX(0, 0, 1)   Log Likelihood               -6127.400
Date:                Tue, 27 Jun 2023   AIC                          12268.800
Time:                        21:18:30   BIC                          12309.821
Sample:                    06-01-2018   HQIC                         12283.665
                         - 12-18-2021                                         
Covariance Type:                  opg                                         
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
intercept      -0.0585      0.139     -0.421      0.674      -0.331       0.214
drift        4.727e-05   8.16e-05      0.579      0.562      -0.000       0.000
Ret_BTCUSDT     0.2146      0.024      9.073    

In [76]:

# Auto-ARIMA on the Ret_ETHUSDT column, without exogenous regressors.
model_arima_eth = fit_auto(df, target='ETHUSDT', exog=False)


                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:                 2592
Model:               SARIMAX(0, 1, 4)   Log Likelihood               -6900.697
Date:                Tue, 27 Jun 2023   AIC                          13815.394
Time:                        21:19:20   BIC                          13856.413
Sample:                    06-01-2018   HQIC                         13830.259
                         - 12-18-2021                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
intercept  -3.961e-05      0.008     -0.005      0.996      -0.016       0.016
drift      -3.552e-07   5.07e-06     -0.070      0.944   -1.03e-05    9.58e-06
ma.L1         -0.8650      0.013    -64.518      0.0

In [77]:
# Auto-ARIMA on the Ret_ETHUSDT column with the other three return columns as regressors.
# Stored under the `_exog` suffix like the BNB/BTC/ADA pairs; the original
# assigned to `model_arima_eth`, overwriting the no-exog ETH model fitted
# in the previous cell.
model_arima_eth_exog = fit_auto(df, 'ETHUSDT', exogenous=['Ret_BTCUSDT', 'Ret_BNBUSDT', 'Ret_ADAUSDT'])




  return np.roots(self.polynomial_reduced_ar)**-1
  return np.roots(self.polynomial_reduced_ma)**-1


                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:                 2592
Model:               SARIMAX(3, 0, 0)   Log Likelihood               -5094.692
Date:                Tue, 27 Jun 2023   AIC                          10207.385
Time:                        21:19:47   BIC                          10260.127
Sample:                    06-01-2018   HQIC                         10226.498
                         - 12-18-2021                                         
Covariance Type:                  opg                                         
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
intercept      -0.1082      0.076     -1.422      0.155      -0.257       0.041
drift        7.985e-05   4.65e-05      1.717      0.086   -1.13e-05       0.000
Ret_BTCUSDT     0.6099      0.011     55.495    