In [1]:
# SARIMA example
from sklearn.metrics import mean_squared_error,mean_absolute_error,mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima_process import ArmaProcess
from statsmodels.graphics.gofplots import qqplot
from statsmodels.tsa.stattools import adfuller
from tqdm import tqdm_notebook
from itertools import product
from typing import Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
def optimize_ARIMA(endog: Union[pd.Series, list], order_list: list, d: int) -> pd.DataFrame:
    """Fit one SARIMAX model per candidate (p, q) and rank the candidates by AIC.

    Parameters
    ----------
    endog : pd.Series or list
        Series the models are fitted on.
    order_list : list
        Candidate ``(p, q)`` pairs; ``order[0]`` is used as p, ``order[1]`` as q.
    d : int
        Differencing order shared by every candidate.

    Returns
    -------
    pd.DataFrame
        Columns ``'(p,q)'`` and ``'AIC'``, sorted by ascending AIC (best
        candidate in row 0) with a fresh 0..n-1 index. If no candidate could
        be fitted the frame is empty but still carries both columns.
    """
    results = []

    for order in tqdm_notebook(order_list):
        try:
            model = SARIMAX(endog, order=(order[0], d, order[1]), simple_differencing=False).fit(disp=False)
        except Exception:
            # Best-effort grid search: a candidate that cannot be fitted is
            # skipped. Only Exception is caught so that KeyboardInterrupt
            # still stops a long-running search.
            continue

        results.append([order, model.aic])

    # Passing the column names to the constructor (instead of assigning
    # `.columns` afterwards) keeps this working when `results` is empty.
    result_df = pd.DataFrame(results, columns=['(p,q)', 'AIC'])

    # Sort in ascending order, lower AIC is better
    result_df = result_df.sort_values(by='AIC', ascending=True).reset_index(drop=True)

    return result_df

In [4]:
# Datasets to evaluate: CSV file stems, five symbols with four variants each.
file_list = [
    f'{symbol}-{variant}'
    for symbol in ('BTCUSD', 'ETHUSD', 'USDCUSD', 'USDTUSD', 'BNBUSD')
    for variant in ('1h', 'all', 'N2Y', 'N4Y')
]

# Fractions passed to the train/test split for every dataset.
spilt_list = [0.7, 0.8, 0.9]

# Per-run label columns, filled by the evaluation loop.
files_list, spilts_list, Classifiers_list = [], [], []

# Per-run error metrics, filled by the evaluation loop.
train_mae, train_rmse = [], []
test_mae, test_rmse = [], []

# Revenue accumulators (not filled anywhere in the cells shown here).
Expected_revenues, Actual_revenue, Expected_total_revenues = [], [], []

In [6]:
# Candidate orders for the AIC search. They do not depend on the file or the
# split, so they are built once instead of on every iteration.
ps = range(0, 20, 1)
qs = range(0, 20, 1)
d = 1
order_list = list(product(ps, qs))

for file in file_list:
    for splits in spilt_list:
        # dataset: only the 'Close' column is modelled
        path = f'../Data/{file}.csv'
        dataset = pd.read_csv(path)
        dataset = dataset['Close']

        # Chronological split (no shuffling); `splits` is the TRAIN fraction.
        train_data, test_data = train_test_split(dataset, train_size=splits, shuffle=False)

        # find AR(p), MA(q): row 0 of the ranking holds the lowest AIC
        # NOTE(review): the order is selected on the raw prices, while the
        # final model below is fitted on the log-transformed, scaled series.
        # Confirm this is intended.
        result_df = optimize_ARIMA(train_data, order_list, d)
        if len(result_df) == 0:
            raise RuntimeError(
                f'No ARIMA order could be fitted for {file} (train_size={splits})'
            )
        p, q = result_df['(p,q)'].iloc[0]

        # Take the logarithm
        train_data = np.log(train_data)
        test_data = np.log(test_data)

        # Centre on the training mean and scale by THREE training standard
        # deviations (so this is not a plain z-score). The same `mean` / `std`
        # are used to undo the scaling below.
        mean = np.mean(train_data)
        std = np.std(train_data) * 3
        standardized_train = (train_data - mean) / std

        model = SARIMAX(standardized_train, order=(p, d, q), seasonal_order=(0, 0, 0, 0))
        model_fit = model.fit(disp=True)

        # make prediction: out-of-sample for the test span, in-sample for train
        # NOTE(review): the in-sample prediction starts at index 0; confirm
        # whether the first `d` points should be left out of the train metrics.
        n_train = len(train_data)
        test_predict = model_fit.predict(start=n_train, end=n_train + test_data.shape[0] - 1)
        train_predict = model_fit.predict(start=0, end=train_data.shape[0] - 1)

        # Undo scaling and log so the errors are in the original price units.
        original_predict_train = np.exp(train_predict * std + mean)
        original_predict_test = np.exp(test_predict * std + mean)

        original_train = np.exp(train_data)
        original_test = np.exp(test_data)

        # scikit-learn convention is (y_true, y_pred); MAE and MSE are
        # symmetric, so the values are the same as with the swapped order.
        run_train_mae = mean_absolute_error(original_train, original_predict_train)
        run_train_rmse = mean_squared_error(original_train, original_predict_train) ** 0.5
        run_test_mae = mean_absolute_error(original_test, original_predict_test)
        run_test_rmse = mean_squared_error(original_test, original_predict_test) ** 0.5

        # Record the run only after everything succeeded, and all columns
        # together, so the seven result lists can never get out of step when
        # an iteration fails part-way through.
        files_list.append(file)
        spilts_list.append(splits)
        Classifiers_list.append(f'ARIMA({p},{d},{q})')
        train_mae.append(run_train_mae)
        train_rmse.append(run_train_rmse)
        test_mae.append(run_test_mae)
        test_rmse.append(run_test_rmse)
                

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

In [19]:
# Rebuild the two label columns from scratch: one entry per (file, split)
# pair, files in the outer position and splits in the inner one.
files_list = []
spilts_list = []
for file, splits in product(file_list, spilt_list):
    files_list.append(file)
    spilts_list.append(splits)
           
            

In [20]:
# Number of (file, split) label entries currently held in files_list.
len(files_list)

75

In [21]:
# Assemble the per-run columns; insertion order defines the column order.
# All seven lists must have the same length or the DataFrame constructor fails.
Summary = {}
Summary['Data'] = files_list
Summary['Classifier'] = Classifiers_list
# Holds the values from spilt_list, which the evaluation loop passes to
# train_test_split as `train_size` (i.e. the train fraction).
Summary['Test Split'] = spilts_list
Summary['train_mae'] = train_mae
Summary['train_rmse'] = train_rmse
Summary['test_mae'] = test_mae
Summary['test_rmse'] = test_rmse

# Build the table, persist it without the index column, then display it.
df_Summary = pd.DataFrame(Summary)
df_Summary.to_excel("Summary-ARIMA(Log).xlsx", index=False)
df_Summary

Unnamed: 0,Data,Classifier,Test Split,train_mae,train_rmse,test_mae,test_rmse
0,BTCUSD-1h,"ARIMA(6,1,2)",0.7,62.258826,105.157046,3901.379719,4111.445697
1,BTCUSD-1h,"ARIMA(6,1,6)",0.8,78.751548,143.459071,429.133890,498.151600
2,BTCUSD-1h,"ARIMA(4,1,2)",0.9,81.310244,152.755449,264.501859,304.973943
3,BTCUSD-1h1Y,"ARIMA(8,1,14)",0.7,65.613703,122.960149,2758.822136,3184.129070
4,BTCUSD-1h1Y,"ARIMA(18,1,13)",0.8,64.175238,121.980097,2672.798982,2939.808920
...,...,...,...,...,...,...,...
70,BNBUSD-N2Y,"ARIMA(17,1,17)",0.8,8.376402,15.774618,45.445953,57.539162
71,BNBUSD-N2Y,"ARIMA(13,1,16)",0.9,8.038146,15.130773,21.163225,25.203628
72,BNBUSD-N4Y,"ARIMA(6,1,12)",0.7,9.592688,18.508471,60.293308,71.126221
73,BNBUSD-N4Y,"ARIMA(5,1,16)",0.8,8.958384,17.478663,38.960182,41.569194


In [12]:
# Report the (rows, columns) shape of every dataset CSV under ../Data/Ndata.
for file in file_list:
    path = f'../Data/Ndata/{file}.csv'
    dataset = pd.read_csv(path)
    print(f'file: {file} shape: {dataset.shape}')

file: BTCUSD-1h shape: (743, 7)
file: BTCUSD-1h1Y shape: (9482, 7)
file: BTCUSD-all shape: (3332, 7)
file: BTCUSD-N2Y shape: (2601, 7)
file: BTCUSD-N4Y shape: (1871, 7)
file: ETHUSD-1h shape: (743, 7)
file: ETHUSD-1h1Y shape: (9480, 7)
file: ETHUSD-all shape: (2183, 7)
file: ETHUSD-N2Y shape: (1453, 7)
file: ETHUSD-N4Y shape: (722, 7)
file: USDCUSD-1h shape: (743, 7)
file: USDCUSD-1h1Y shape: (9482, 7)
file: USDCUSD-all shape: (1850, 7)
file: USDCUSD-N2Y shape: (1119, 7)
file: USDCUSD-N4Y shape: (389, 7)
file: USDTUSD-1h shape: (743, 7)
file: USDTUSD-1h1Y shape: (9482, 7)
file: USDTUSD-all shape: (2183, 7)
file: USDTUSD-N2Y shape: (1453, 7)
file: USDTUSD-N4Y shape: (722, 7)
file: BNBUSD-1h shape: (743, 7)
file: BNBUSD-1h1Y shape: (9482, 7)
file: BNBUSD-all shape: (2183, 7)
file: BNBUSD-N2Y shape: (1453, 7)
file: BNBUSD-N4Y shape: (722, 7)
