In [22]:
# SARIMA example
from sklearn.metrics import mean_squared_error,mean_absolute_error,mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima_process import ArmaProcess
from statsmodels.graphics.gofplots import qqplot
from statsmodels.tsa.stattools import adfuller
from tqdm import tqdm_notebook
from itertools import product
from typing import Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [23]:
def optimize_ARIMA(endog: Union[pd.Series, list], order_list: list, d: int) -> pd.DataFrame:
    """Rank candidate ARIMA(p, d, q) orders by AIC.

    Each entry of ``order_list`` is fitted with ``SARIMAX`` using the fixed
    differencing order ``d``; orders whose fit raises are skipped.

    Parameters
    ----------
    endog : pd.Series or list
        Series the models are fitted on.
    order_list : list
        Candidate orders; element ``[0]`` is used as p and element ``[1]`` as q.
    d : int
        Differencing order shared by every candidate.

    Returns
    -------
    pd.DataFrame
        Columns ``'(p,q)'`` and ``'AIC'``, sorted by ascending AIC (lower is
        better). The frame is empty (but still has both columns) when
        ``order_list`` is empty or no candidate could be fitted.
    """
    results = []

    for order in tqdm_notebook(order_list):
        try:
            model = SARIMAX(endog, order=(order[0], d, order[1]), simple_differencing=False).fit(disp=False)
        except Exception:
            # Best-effort search: skip orders that cannot be fitted. Catching
            # Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
            continue

        results.append([order, model.aic])

    # Passing the column names to the constructor keeps this valid even when
    # `results` is empty; assigning `.columns` afterwards raises a length
    # mismatch on a zero-column frame.
    result_df = pd.DataFrame(results, columns=['(p,q)', 'AIC'])

    # Sort in ascending order, lower AIC is better
    result_df = result_df.sort_values(by='AIC', ascending=True).reset_index(drop=True)

    return result_df

In [24]:
# Experiment configuration and result accumulators.
#
# Earlier runs used the same four suffixes (-1h, -all, -N2Y, -N4Y) for the
# BTCUSD, ETHUSD, USDTUSD and BNBUSD tickers; only USDCUSD is active here.

# CSV file stems; the grid-search loop reads each as '../Data/<stem>.csv'.
file_list = [
    'USDCUSD-1h',
    'USDCUSD-all',
    'USDCUSD-N2Y',
    'USDCUSD-N4Y',
]

# Fractions handed to train_test_split as `train_size` by the grid-search loop.
spilt_list = [0.7, 0.8, 0.9]

# Row labels for the summary table (dataset, split fraction, model name).
files_list, spilts_list, Classifiers_list = [], [], []

# Error metrics, one entry per evaluated configuration.
train_mae, train_rmse = [], []
test_mae, test_rmse = [], []

# Revenue accumulators; not written to by the cells visible in this notebook section.
Expected_revenues, Actual_revenue, Expected_total_revenues = [], [], []

In [25]:
# Placeholder metrics for a configuration that could not be evaluated.
# NaN (rather than 0) so a failure cannot be mistaken for a perfect score.
_FAILED_SCORES = (np.nan, np.nan, np.nan, np.nan)


def _evaluate_arima(close, train_size, p, q):
    """Fit ARIMA(p, 1, q) on the leading part of `close` and score it.

    The training slice is log-transformed and standardised before fitting;
    predictions are mapped back to the original scale before computing errors.

    Returns a tuple ``(train_mae, train_rmse, test_mae, test_rmse)``.
    Raises whatever the split, the fit or the metric functions raise.
    """
    # Chronological split: shuffle=False keeps the first `train_size` fraction
    # as training data and the remainder as the hold-out.
    train_raw, test_raw = train_test_split(close, train_size=train_size, shuffle=False)

    # Take the logarithm, then centre/scale using training statistics only.
    train_log = np.log(train_raw)
    mean = np.mean(train_log)
    # Scale is three standard deviations, as in the original analysis.
    # NOTE(review): a constant training series gives std == 0 and a division
    # by zero here; such a run ends up recorded as failed.
    std = np.std(train_log) * 3
    standardized_train = (train_log - mean) / std

    model = SARIMAX(standardized_train, order=(p, 1, q), seasonal_order=(0, 0, 0, 0))
    # disp=False: do not print optimiser progress for every one of the fits
    # (same setting optimize_ARIMA uses).
    model_fit = model.fit(disp=False)

    n_train = len(train_raw)
    n_test = len(test_raw)

    # Positions 0..n_train-1 are in-sample predictions; n_train onwards are
    # out-of-sample forecasts covering the hold-out.
    train_predict = model_fit.predict(start=0, end=n_train - 1)
    test_predict = model_fit.predict(start=n_train, end=n_train + n_test - 1)

    # Undo the standardisation and the log transform.
    original_predict_train = np.exp(train_predict * std + mean)
    original_predict_test = np.exp(test_predict * std + mean)

    # Compare against the untransformed prices directly instead of
    # round-tripping them through exp(log(.)).
    return (
        mean_absolute_error(train_raw, original_predict_train),
        mean_squared_error(train_raw, original_predict_train) ** 0.5,
        mean_absolute_error(test_raw, original_predict_test),
        mean_squared_error(test_raw, original_predict_test) ** 0.5,
    )


# Grid search: every dataset x train fraction x (p, q) in 0..10, with d fixed at 1.
# Exactly one entry is appended to EVERY bookkeeping list per configuration,
# success or failure, so the lists always have equal length when they are
# later assembled into a DataFrame.
for file in file_list:
    path = f'../Data/{file}.csv'

    # The CSV depends only on `file`, so read it once per dataset instead of
    # once per (split, p, q) combination.
    try:
        close = pd.read_csv(path)['Close']
        load_error = None
    except Exception as exc:
        close = None
        load_error = exc

    for splits in spilt_list:
        for p in range(11):
            for q in range(11):
                files_list.append(file)
                spilts_list.append(splits)
                Classifiers_list.append(f'ARIMA({p},1,{q})')

                scores = _FAILED_SCORES
                error = load_error
                if error is None:
                    try:
                        scores = _evaluate_arima(close, splits, p, q)
                    except Exception as exc:
                        # Keep the sweep going, but remember why this one failed.
                        # Exception (not bare except) leaves KeyboardInterrupt alone.
                        error = exc

                train_mae.append(scores[0])
                train_rmse.append(scores[1])
                test_mae.append(scores[2])
                test_rmse.append(scores[3])

                if error is None:
                    print(f'{file} -- ARIMA({p},1,{q}) is done')
                else:
                    print(f'{file} -- ARIMA({p},1,{q}) failed: {error!r}')
            

USDCUSD-1h -- ARIMA(0,1,0) is done
USDCUSD-1h -- ARIMA(0,1,1) is done
USDCUSD-1h -- ARIMA(0,1,2) is done
USDCUSD-1h -- ARIMA(0,1,3) is done
USDCUSD-1h -- ARIMA(0,1,4) is done
USDCUSD-1h -- ARIMA(0,1,5) is done
USDCUSD-1h -- ARIMA(0,1,6) is done
USDCUSD-1h -- ARIMA(0,1,7) is done
USDCUSD-1h -- ARIMA(0,1,8) is done
USDCUSD-1h -- ARIMA(0,1,9) is done
USDCUSD-1h -- ARIMA(0,1,10) is done
USDCUSD-1h -- ARIMA(1,1,0) is done
USDCUSD-1h -- ARIMA(1,1,1) is done
USDCUSD-1h -- ARIMA(1,1,2) is done
USDCUSD-1h -- ARIMA(1,1,3) is done
USDCUSD-1h -- ARIMA(1,1,4) is done
USDCUSD-1h -- ARIMA(1,1,5) is done
USDCUSD-1h -- ARIMA(1,1,6) is done
USDCUSD-1h -- ARIMA(1,1,7) is done
USDCUSD-1h -- ARIMA(1,1,8) is done
USDCUSD-1h -- ARIMA(1,1,9) is done
USDCUSD-1h -- ARIMA(1,1,10) is done
USDCUSD-1h -- ARIMA(2,1,0) is done
USDCUSD-1h -- ARIMA(2,1,1) is done
USDCUSD-1h -- ARIMA(2,1,2) is done
USDCUSD-1h -- ARIMA(2,1,3) is done
USDCUSD-1h -- ARIMA(2,1,4) is done
USDCUSD-1h -- ARIMA(2,1,5) is done
USDCUSD-1h -- ARIM

In [21]:
# Number of metric entries accumulated in train_mae so far.
len(train_mae)

1070

In [26]:
# Assemble the bookkeeping lists into one table: one row per evaluated
# (dataset, split, model) configuration. All lists must have equal length.
# NOTE: the 'Test Split' column carries the values of spilt_list, which the
# grid-search loop passes to train_test_split as `train_size`.
Summary = dict([
    ('Data', files_list),
    ('Classifier', Classifiers_list),
    ('Test Split', spilts_list),
    ('train_mae', train_mae),
    ('train_rmse', train_rmse),
    ('test_mae', test_mae),
    ('test_rmse', test_rmse),
])

df_Summary = pd.DataFrame(data=Summary)

# Persist the table next to the notebook (index column omitted), then display it.
df_Summary.to_excel(excel_writer="Summary-ARIMA-.xlsx", index=False)
df_Summary

Unnamed: 0,Data,Classifier,Test Split,train_mae,train_rmse,test_mae,test_rmse
0,USDCUSD-1h,"ARIMA(0,1,0)",0.7,0.000086,0.000118,0.000124,0.000154
1,USDCUSD-1h,"ARIMA(0,1,1)",0.7,0.000066,0.000088,0.000136,0.000164
2,USDCUSD-1h,"ARIMA(0,1,2)",0.7,0.000066,0.000088,0.000140,0.000169
3,USDCUSD-1h,"ARIMA(0,1,3)",0.7,0.000066,0.000088,0.000151,0.000178
4,USDCUSD-1h,"ARIMA(0,1,4)",0.7,0.000066,0.000088,0.000152,0.000179
...,...,...,...,...,...,...,...
1447,USDCUSD-N4Y,"ARIMA(10,1,6)",0.9,0.000220,0.001530,0.000104,0.000140
1448,USDCUSD-N4Y,"ARIMA(10,1,7)",0.9,0.000213,0.001529,0.000149,0.000183
1449,USDCUSD-N4Y,"ARIMA(10,1,8)",0.9,0.000212,0.001528,0.000143,0.000177
1450,USDCUSD-N4Y,"ARIMA(10,1,9)",0.9,0.000211,0.001528,0.000150,0.000184
