In [1]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose, STL
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima_process import ArmaProcess
from statsmodels.graphics.gofplots import qqplot
from statsmodels.tsa.stattools import adfuller
from tqdm import tqdm_notebook
from itertools import product
from typing import Union

import matplotlib.pyplot as plt
import statsmodels.api as sm
import pandas as pd
import numpy as np

import datetime
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
df = pd.read_csv('data\BTC_final.csv')
df.head()

Unnamed: 0,index,Date,Open,High,Low,Close,Adj Close,Volume,SMA_7,SMA_14,EMA_7,EMA_14,RSI,Stoch_RSI,Upper_BB,Lower_BB,MACD,ScorePositive,ScoreNegative,ScoreNeutral
0,27,2018-01-28,11475.299805,12040.299805,11475.299805,11786.299805,11786.299805,8350360000.0,11259.571708,11635.535854,11516.047002,11884.539097,37.129744,0.560353,15192.188206,9606.291969,-761.452864,0.023502,0.057272,0.919226
1,28,2018-01-29,11755.5,11875.599609,11179.200195,11296.400391,11296.400391,7107360000.0,11311.714565,11455.293039,11461.122271,11804.864485,34.531168,0.83516,14866.144405,9602.435771,-744.090581,0.163158,0.294283,0.542558
2,29,2018-01-30,11306.799805,11307.200195,10036.200195,10106.299805,10106.299805,8637860000.0,11202.843052,11356.421596,11122.356157,11575.251876,40.137091,1.0,14450.867575,9531.012601,-816.944788,0.102038,0.260553,0.63741
3,30,2018-01-31,10108.200195,10381.599609,9777.419922,10221.099609,10221.099609,8041160000.0,11040.228655,11287.314453,10897.011838,11392.534553,42.91738,1.0,14318.165125,9345.245031,-855.556563,0.098884,0.268044,0.633072
4,31,2018-02-01,10237.299805,10288.799805,8812.280273,9170.540039,9170.540039,9959400000.0,10741.820033,11122.717285,10465.350527,11093.196564,34.828472,0.651247,14132.400116,9050.004084,-959.863343,0.075772,0.452595,0.471633


In [3]:
target = df['Adj Close']
exog = df[['Open', 'High', 'Low', 'Volume', 'SMA_7','SMA_14','EMA_7','EMA_14','RSI','Stoch_RSI','Upper_BB','Lower_BB','MACD','ScorePositive','ScoreNegative','ScoreNeutral']]

In [4]:
ad_fuller_result = adfuller(target)

print(f'ADF Statistic: {ad_fuller_result[0]}')
print(f'p-value: {ad_fuller_result[1]}')

ADF Statistic: -1.3639653753586114
p-value: 0.599424531921265


In [5]:
target_diff = target.diff()

ad_fuller_result = adfuller(target_diff[1:])

print(f'ADF Statistic: {ad_fuller_result[0]}')
print(f'p-value: {ad_fuller_result[1]}')

ADF Statistic: -8.311394347344411
p-value: 3.78050429631799e-13


In [6]:
def optimize_SARIMAX(endog: Union[pd.Series, list], exog: Union[pd.Series, list], order_list: list, d: int, D: int, s: int) -> pd.DataFrame:
    
    results = []
    
    for order in tqdm_notebook(order_list):
        try: 
            model = SARIMAX(
                endog,
                exog,
                order=(order[0], d, order[1]),
                seasonal_order=(order[2], D, order[3], s),
                simple_differencing=False).fit(disp=False)
        except:
            continue
            
        aic = model.aic
        results.append([order, aic])
        
    result_df = pd.DataFrame(results)
    result_df.columns = ['(p,q,P,Q)', 'AIC']
    
    #Sort in ascending order, lower AIC is better
    result_df = result_df.sort_values(by='AIC', ascending=True).reset_index(drop=True)
    
    return result_df

In [7]:
p = range(0, 5, 1)
d = 1
q = range(0, 5, 1)
P = range(0, 3, 1)
D = 0
Q = range(0, 3, 1)
s = 365

parameters = product(p, q, P, Q)
parameters_list = list(parameters)

In [8]:
target_train = target[:200]
exog_train = exog[:200]

result_df = optimize_SARIMAX(target_train, exog_train, parameters_list, d, D, s)
result_df

  0%|          | 0/225 [00:00<?, ?it/s]

MemoryError: Unable to allocate 822. MiB for an array with shape (732, 732, 201) and data type float64

: 

: 

In [28]:
best_model = SARIMAX(target_train, exog_train, order=(3,1,3), seasonal_order=(0,0,0,7), simple_differencing=False)
best_model_fit = best_model.fit(disp=False)

print(best_model_fit.summary())

                               SARIMAX Results                                
Dep. Variable:                  Close   No. Observations:                 2151
Model:               SARIMAX(3, 1, 3)   Log Likelihood              -13484.530
Date:                Sun, 30 Apr 2023   AIC                          26995.060
Time:                        01:44:59   BIC                          27068.812
Sample:                             0   HQIC                         27022.041
                               - 2151                                         
Covariance Type:                  opg                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Open              -0.6050      0.004   -142.394      0.000      -0.613      -0.597
High               0.6945      0.005    135.740      0.000       0.684       0.705
Low                0.6169      0.005

In [29]:
def recursive_forecast(endog: Union[pd.Series, list], exog: Union[pd.Series, list], train_len: int, horizon: int, window: int, method: str) -> list:
    
    total_len = train_len + horizon

    if method == 'last':
        pred_last_value = []
        
        for i in range(train_len, total_len, window):
            last_value = endog[:i].iloc[-1]
            pred_last_value.extend(last_value for _ in range(window))
            
        return pred_last_value
    
    elif method == 'SARIMAX':
        pred_SARIMAX = []
        
        for i in range(train_len, total_len, window):
            model = SARIMAX(endog[:i], exog[:i], order=(3,1,3), seasonal_order=(0,0,0,7), simple_differencing=False)
            res = model.fit(disp=False)
            predictions = res.get_prediction(exog=exog)
            oos_pred = predictions.predicted_mean.iloc[-window:]
            pred_SARIMAX.extend(oos_pred)
            
        return pred_SARIMAX

In [30]:
target_train = target[:3066]
target_test = target[3066:]

pred_df = pd.DataFrame({'actual': target_test})

TRAIN_LEN = len(target_train)
HORIZON = len(target_test)
WINDOW = 1

pred_SARIMAX = recursive_forecast(target, exog, TRAIN_LEN, HORIZON, WINDOW, 'SARIMAX')

pred_df['pred_SARIMAX'] = pred_SARIMAX

pred_df

Unnamed: 0,actual,pred_SARIMAX
3066,24188.84375,24768.773977
3067,23947.49219,23891.057892
3068,23198.12695,24443.585731
3069,23175.375,23356.189465
3070,23561.21289,22893.682834
3071,23522.87109,23640.721608
3072,23433.81641,23565.520908


In [31]:
mean_absolute_error(pred_df.actual, pred_df.pred_SARIMAX)

425.67469182453505