Import Packages

In [1]:
import numpy as np
import pandas as pd
import scipy
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from statsmodels.tsa.arima_model import ARIMA
from arch import arch_model
import seaborn as sns
import yfinance
import warnings
warnings.filterwarnings("ignore")
sns.set()
from pmdarima.arima import auto_arima

Loading the data

In [2]:
raw_data = yfinance.download (tickers = "^GSPC ^FTSE ^N225 ^GDAXI", start = "2000-01-03", end = "2020-01-31", 
                              interval = "1d", group_by = 'ticker', auto_adjust = True, treads = True)

[*********************100%***********************]  4 of 4 downloaded


In [3]:
df_com = raw_data.copy()
df_com['spx'] = df_com['^GSPC'].Close[:]
df_com['dax'] = df_com['^GDAXI'].Close[:]
df_com['ftse'] = df_com['^FTSE'].Close[:]
df_com['nikkei'] = df_com['^N225'].Close[:]

In [4]:
df_com = df_com.iloc[1:]
del df_com['^N225']
del df_com['^GSPC']
del df_com['^GDAXI']
del df_com['^FTSE']
df_com=df_com.asfreq('b')
df_com=df_com.fillna(method='ffill')

In [5]:
df_com['ret_spx'] = df_com.spx.pct_change(1)*100
df_com['ret_ftse'] = df_com.ftse.pct_change(1)*100
df_com['ret_dax'] = df_com.dax.pct_change(1)*100
df_com['ret_nikkei'] = df_com.nikkei.pct_change(1)*100

In [6]:
size = int(len(df_com)*0.8)
df, df_test = df_com.iloc[:size], df_com.iloc[size:]

Auto ARIMA

In [11]:
model_auto = auto_arima(df_com.ret_spx[1:], exogenous = df_com[['ret_ftse', 'ret_dax', 'ret_nikkei']][1:], m = 5,
                       max_order = None, max_p = 7, max_q = 7, max_d = 2, max_P = 4, max_Q = 4, max_D = 2,
                       maxiter = 50, alpha = 0.05, n_jobs = -1, trend = 'ct', information_criterion = 'oob',
                       out_of_sample = int(len(df_com)*0.2))


# !!! Important Note: In pdmarima v1.5.2, out_of_sample_size is replaced with out_of_sample, so make sure to use the latter!


# exogenous -> outside factors (e.g other time series)
# m -> seasonal cycle length
# max_order -> maximum amount of variables to be used in the regression (p + q)
# max_p -> maximum AR components
# max_q -> maximum MA components
# max_d -> maximum Integrations
# maxiter -> maximum iterations we're giving the model to converge the coefficients (becomes harder as the order increases)
# alpha -> level of significance, default is 5%, which we should be using most of the time
# n_jobs -> how many models to fit at a time (-1 indicates "as many as possible")
# trend -> "ct" usually
# information_criterion -> 'aic', 'aicc', 'bic', 'hqic', 'oob' 
#        (Akaike Information Criterion, Corrected Akaike Information Criterion,
#        Bayesian Information Criterion, Hannan-Quinn Information Criterion, or
#        "out of bag"--for validation scoring--respectively)
# out_of_smaple -> validates the model selection (pass the entire dataset, and set 20% to be the out_of_sample_size)

In [12]:
model_auto.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,5238.0
Model:,"SARIMAX(2, 0, 2)x(1, 0, [1], 5)",Log Likelihood,-6809.091
Date:,"Tue, 03 Mar 2020",AIC,13642.182
Time:,01:18:54,BIC,13720.947
Sample:,01-05-2000,HQIC,13669.723
,- 01-31-2020,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,-0.0218,0.126,-0.173,0.862,-0.268,0.224
drift,4.98e-06,2.89e-05,0.172,0.863,-5.16e-05,6.16e-05
ret_ftse,0.1557,0.012,13.447,0.000,0.133,0.178
ret_dax,0.3903,0.009,44.456,0.000,0.373,0.407
ret_nikkei,-0.0320,0.006,-5.194,0.000,-0.044,-0.020
ar.L1,0.4305,3.762,0.114,0.909,-6.942,7.804
ar.L2,-0.0216,0.305,-0.071,0.944,-0.620,0.577
ma.L1,-0.7494,3.762,-0.199,0.842,-8.122,6.624
ma.L2,0.1347,1.502,0.090,0.929,-2.810,3.080

0,1,2,3
Ljung-Box (Q):,91.27,Jarque-Bera (JB):,10326.61
Prob(Q):,0.0,Prob(JB):,0.0
Heteroskedasticity (H):,0.55,Skew:,-0.24
Prob(H) (two-sided):,0.0,Kurtosis:,9.86
