In [5]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima_process import ArmaProcess
from statsmodels.graphics.gofplots import qqplot
from statsmodels.tsa.stattools import adfuller
from tqdm import tqdm_notebook
from itertools import product
from typing import Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime

import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this may also hide deprecation and pandas copy warnings raised by later cells.
warnings.filterwarnings('ignore')

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [6]:
# Load BTC_final.csv into a DataFrame.
# Forward slashes keep the relative path portable: the previous backslash-separated
# literal depended on "\D" and "\B" being unrecognised escape sequences (deprecated
# in Python) and only resolved on Windows.
df = pd.read_csv('FinBERT/Data_final/BTC_final.csv')
# Drop the 'Unnamed: 0' column (presumably a row index written out with the CSV — confirm).
df = df.drop(columns='Unnamed: 0')
df

Unnamed: 0,CoinScore,CoinDate,Open,High,Low,Close,Volume
0,-0.224409,2014-10-01,387.427002,391.378998,380.779999,383.614990,2.622940e+07
1,0.162439,2014-10-02,383.988007,385.497009,372.946014,375.071991,2.177770e+07
2,-0.136249,2014-10-03,375.181000,377.695007,357.859009,359.511993,3.090120e+07
3,-0.134887,2014-10-04,359.891998,364.487000,325.885986,328.865997,4.723650e+07
4,-0.134887,2014-10-05,328.915985,341.800995,289.295990,320.510010,8.330810e+07
...,...,...,...,...,...,...,...
3068,-0.530137,2023-02-24,23946.007810,24103.705080,23007.072270,23198.126950,2.681174e+10
3069,-0.524836,2023-02-25,23200.125000,23210.210940,22861.558590,23175.375000,1.610072e+10
3070,-0.524836,2023-02-26,23174.150390,23654.367190,23084.220700,23561.212890,1.664453e+10
3071,-0.122506,2023-02-27,23561.451170,23857.890630,23205.878910,23522.871090,2.266076e+10


In [7]:
# Convert the date column to a numeric timestamp so it can be used in later arithmetic.
# NOTE(review): datetime.timestamp on timezone-naive values appears to use the machine's
# local timezone — confirm this is intended.
coin_dates = pd.to_datetime(df['CoinDate'])
df = (
    df.assign(Coin_timestamp=coin_dates.map(datetime.datetime.timestamp))
      .drop(columns=['CoinDate'])
)
df

Unnamed: 0,CoinScore,Open,High,Low,Close,Volume,Coin_timestamp
0,-0.224409,387.427002,391.378998,380.779999,383.614990,2.622940e+07,1.412093e+09
1,0.162439,383.988007,385.497009,372.946014,375.071991,2.177770e+07,1.412179e+09
2,-0.136249,375.181000,377.695007,357.859009,359.511993,3.090120e+07,1.412266e+09
3,-0.134887,359.891998,364.487000,325.885986,328.865997,4.723650e+07,1.412352e+09
4,-0.134887,328.915985,341.800995,289.295990,320.510010,8.330810e+07,1.412438e+09
...,...,...,...,...,...,...,...
3068,-0.530137,23946.007810,24103.705080,23007.072270,23198.126950,2.681174e+10,1.677168e+09
3069,-0.524836,23200.125000,23210.210940,22861.558590,23175.375000,1.610072e+10,1.677254e+09
3070,-0.524836,23174.150390,23654.367190,23084.220700,23561.212890,1.664453e+10,1.677341e+09
3071,-0.122506,23561.451170,23857.890630,23205.878910,23522.871090,2.266076e+10,1.677427e+09


In [8]:
# Augmented Dickey-Fuller test on the raw closing prices.
ad_fuller_result = adfuller(df['Close'])
adf_statistic, adf_p_value = ad_fuller_result[0], ad_fuller_result[1]

print(f'ADF Statistic: {adf_statistic}', f'p-value: {adf_p_value}', sep='\n')

ADF Statistic: -1.6052963344633844
p-value: 0.4809324510344754


In [9]:
# First-order difference of the closing prices, then repeat the ADF test on it.
eps_diff = np.diff(df['Close'])
ad_fuller_result = adfuller(eps_diff)

# Only the first two entries of the result (statistic, p-value) are reported.
for label, value in zip(('ADF Statistic', 'p-value'), ad_fuller_result):
    print(f'{label}: {value}')

ADF Statistic: -9.282630023506483
p-value: 1.2393192939570603e-15


In [10]:
def optimize_ARIMA(endog: Union[pd.Series, list], order_list: list, d: int) -> pd.DataFrame:
    """Fit one SARIMAX(p, d, q) model per candidate order and rank them by AIC.

    Parameters
    ----------
    endog : pd.Series or list
        The observed series passed to SARIMAX.
    order_list : list
        Candidate (p, q) pairs; element 0 is used as p and element 1 as q.
    d : int
        Differencing order shared by every candidate.

    Returns
    -------
    pd.DataFrame
        Columns '(p,q)' and 'AIC', sorted by ascending AIC (lower is better).
        Orders whose fit raised an exception are omitted; the frame is empty
        (but still has both columns) when no candidate could be fitted.
    """
    results = []

    for order in tqdm_notebook(order_list):
        try:
            model = SARIMAX(endog, order=(order[0], d, order[1]), simple_differencing=False).fit(disp=False)
        except Exception:
            # Best-effort search: skip orders that cannot be fitted. Catching
            # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
            # stop a long-running grid search.
            continue

        results.append([order, model.aic])

    # Naming the columns at construction keeps this valid when `results` is empty;
    # assigning .columns afterwards raises a length mismatch on an empty frame.
    result_df = pd.DataFrame(results, columns=['(p,q)', 'AIC'])

    # Sort in ascending order, lower AIC is better
    result_df = result_df.sort_values(by='AIC', ascending=True).reset_index(drop=True)

    return result_df

In [11]:
# Candidate AR (p) and MA (q) orders run from 0 through 3; d is the differencing order.
d = 1
ps = range(4)
qs = range(4)

# Every (p, q) combination, with p varying slowest.
order_list = [(p, q) for p in ps for q in qs]

In [12]:
# Hold out the final 7 observations and grid-search (p, q) on the remaining closing prices.
train = df['Close'].iloc[:-7]

result_df = optimize_ARIMA(endog=train, order_list=order_list, d=d)
result_df

  0%|          | 0/16 [00:00<?, ?it/s]

Unnamed: 0,"(p,q)",AIC
0,"(2, 2)",49608.213115
1,"(3, 2)",49610.242912
2,"(2, 3)",49610.26869
3,"(3, 3)",49611.928034
4,"(0, 0)",49627.635054
5,"(1, 0)",49627.731486
6,"(0, 1)",49627.754281
7,"(1, 2)",49628.890456
8,"(0, 2)",49629.649473
9,"(2, 0)",49629.667179


In [13]:
# Fit an ARIMA(2, 1, 2) model on the training series and print the fit summary.
model = SARIMAX(endog=train, order=(2, 1, 2), simple_differencing=False)
model_fit = model.fit(disp=False)

summary = model_fit.summary()
print(summary)

                               SARIMAX Results                                
Dep. Variable:                  Close   No. Observations:                 3066
Model:               SARIMAX(2, 1, 2)   Log Likelihood              -24799.107
Date:                Sun, 30 Apr 2023   AIC                          49608.213
Time:                        01:15:06   BIC                          49638.352
Sample:                             0   HQIC                         49619.042
                               - 3066                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.8179      0.009     88.343      0.000       0.800       0.836
ar.L2         -0.9498      0.009   -109.141      0.000      -0.967      -0.933
ma.L1         -0.8474      0.009    -94.408      0.0

In [14]:
# Hold-out set: the last 7 rows of df, with all columns.
# .copy() makes `test` an independent frame, so adding columns to it later is not
# a write into a slice of df (the case pandas reports as SettingWithCopyWarning).
test = df.iloc[-7:].copy()
test

Unnamed: 0,CoinScore,Open,High,Low,Close,Volume,Coin_timestamp
3066,-0.138639,24437.41797,24472.33984,23644.31836,24188.84375,30200000000.0,1676995000.0
3067,-0.73681,24190.71875,24572.08984,23693.91992,23947.49219,30476260000.0,1677082000.0
3068,-0.530137,23946.00781,24103.70508,23007.07227,23198.12695,26811740000.0,1677168000.0
3069,-0.524836,23200.125,23210.21094,22861.55859,23175.375,16100720000.0,1677254000.0
3070,-0.524836,23174.15039,23654.36719,23084.2207,23561.21289,16644530000.0,1677341000.0
3071,-0.122506,23561.45117,23857.89063,23205.87891,23522.87109,22660760000.0,1677427000.0
3072,-0.736306,23512.17773,23521.54688,23400.39648,23433.81641,22190760000.0,1677514000.0


In [15]:
# Forecast the hold-out window that immediately follows the training data.
# The start/end positions are derived from the split sizes instead of being
# hard-coded, so they stay correct if the dataset length changes.
forecast_start = len(train)
forecast_end = forecast_start + len(test) - 1  # end position is inclusive
ARIMA_pred = model_fit.get_prediction(start=forecast_start, end=forecast_end).predicted_mean

# assign() returns a new frame with the predictions aligned on the index; this avoids
# writing into a slice of df and keeps the cell safe to re-run.
test = test.assign(ARIMA_pred=ARIMA_pred)
test

Unnamed: 0,CoinScore,Open,High,Low,Close,Volume,Coin_timestamp,ARIMA_pred
3066,-0.138639,24437.41797,24472.33984,23644.31836,24188.84375,30200000000.0,1676995000.0,24453.341954
3067,-0.73681,24190.71875,24572.08984,23693.91992,23947.49219,30476260000.0,1677082000.0,24451.155648
3068,-0.530137,23946.00781,24103.70508,23007.07227,23198.12695,26811740000.0,1677168000.0,24433.232437
3069,-0.524836,23200.125,23210.21094,22861.55859,23175.375,16100720000.0,1677254000.0,24420.64996
3070,-0.524836,23174.15039,23654.36719,23084.2207,23561.21289,16644530000.0,1677341000.0,24427.38196
3071,-0.122506,23561.45117,23857.89063,23205.87891,23522.87109,22660760000.0,1677427000.0,24444.838346
3072,-0.736306,23512.17773,23521.54688,23400.39648,23433.81641,22190760000.0,1677514000.0,24452.721659


In [16]:
# Mean squared error of the ARIMA predictions against the held-out closing prices.
actual_close = test['Close']
predicted_close = test['ARIMA_pred']
mean_squared_error(y_true=actual_close, y_pred=predicted_close)

862610.2651712224