In [15]:
# SARIMA example
from sklearn.metrics import mean_squared_error,mean_absolute_error,mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima_process import ArmaProcess
from statsmodels.graphics.gofplots import qqplot
from statsmodels.tsa.stattools import adfuller
from tqdm import tqdm_notebook
from itertools import product
from typing import Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [16]:
def optimize_ARIMA(endog: Union[pd.Series, list], order_list: list, d: int) -> pd.DataFrame:
    
    results = []
    
    for order in tqdm_notebook(order_list):
        try: 
            model = SARIMAX(endog, order=(order[0], d, order[1]), simple_differencing=False).fit(disp=False)
        except:
            continue
            
        aic = model.aic
        results.append([order, aic])
        
    result_df = pd.DataFrame(results)
    result_df.columns = ['(p,q)', 'AIC']
    
    #Sort in ascending order, lower AIC is better
    result_df = result_df.sort_values(by='AIC', ascending=True).reset_index(drop=True)
    
    return result_df

In [17]:
# 創建時間窗口的訓練集數據
def create_dataset(closing_prices, time_window):
    features,targets = [],[]
    for i in range(len(closing_prices) - time_window):
        features.append(closing_prices.iloc[i:i+time_window])
        targets.append(closing_prices.iloc[i+time_window])   
    return np.array(features),np.array(targets)

In [18]:
#dataset   
file='BTCUSD-all'
splits=0.7
lb=7

path =f'../Data/{file}.csv'
dataset = pd.read_csv(path)
dataset=dataset['Close']

#Split dataset
train_data, test_data = train_test_split(dataset, train_size=splits, shuffle=False)


trainX, trainY = create_dataset(train_data, lb)
testX, testY = create_dataset(test_data, lb)


#find AR(p),MA(q)
ps = range(0, 5, 1)
qs = range(0, 5, 1)
d = 1

order_list = list(product(ps, qs))

result_df = optimize_ARIMA(trainX, order_list, d)
p=result_df['(p,q)'][0][0]
q=result_df['(p,q)'][0][1]

print(f'ARIMA({p},1,{q})')


# Take the logarithm
train_data = np.log(trainX)
test_data= np.log(testX)

# Calculate mean and standard deviation
mean = np.mean(train_data)
std = np.std(train_data)*3

# # Z-score normalization
standardized_train = (train_data - mean) / std



model = SARIMAX(standardized_train, order=(p, 1, q), seasonal_order=(0, 0, 0,0))
model_fit = model.fit(disp=True)

# # make prediction
test_predict = model_fit.predict(start=len(train_data), end=(len(train_data)+test_data.shape[0]-1))
train_predict = model_fit.predict(start=0, end=(train_data.shape[0]-1))
    

original_predict_train = np.exp(train_predict*std+mean)
original_predict_test = np.exp(test_predict*std+mean)

original_train = trainY
original_test = testY
        
print(f'train mae: {mean_absolute_error(original_predict_train,original_train)}')
print(f'train rmse: {mean_squared_error(original_predict_train,original_train)**0.5}')
print(f'test mae: {mean_absolute_error(original_predict_test,original_test)}')
print(f'test rmse: {mean_squared_error(original_predict_test,original_test)**0.5}')

print(f'{file} -- ARIMA({p},1,{q}) is done')



  0%|          | 0/25 [00:00<?, ?it/s]

ValueError: Length mismatch: Expected axis has 0 elements, new values have 2 elements