## Check Stationarity and normalize

In [35]:
#https://machinelearningmastery.com/multivariate-time-series-forecasting-lstms-keras/
#https://www.analyticsvidhya.com/blog/2018/09/multivariate-time-series-guide-forecasting-modeling-python-codes/
#https://www.analyticsvidhya.com/blog/2018/09/non-stationary-time-series-python/

In [1]:
import data_prep_helper
from statsmodels.tsa.vector_ar.var_model import VAR
import pandas as pd
from math import sqrt
from sklearn.metrics import mean_squared_error
import plotly.graph_objects as go
from plotly.subplots import make_subplots

  data_klasses = (pandas.Series, pandas.DataFrame, pandas.Panel)


In [14]:
corr_df = data_prep_helper.get_corr_df()

In [15]:
corr_df = corr_df.set_index("bitcoin_Date")
cols = corr_df.columns

In [4]:
from matplotlib import pyplot
from statsmodels.tsa.seasonal import seasonal_decompose
# Input dataframe df has prices under the column name 'close'

series = pd.DataFrame(data=corr_df['bitcoin_Price'].values, index=pd.to_datetime(corr_df.index), columns = ['bitcoin_Price']).dropna()
result = seasonal_decompose(series.values, model='multiplicative', freq=30)
result.plot()
pyplot.show()

<Figure size 640x480 with 4 Axes>

### AdFuller Test

In [5]:
#define function for ADF test
from statsmodels.tsa.stattools import adfuller
def adf_test(timeseries):
    #Perform Dickey-Fuller test:
    print ('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
    for key,value in dftest[4].items():
       dfoutput['Critical Value (%s)'%key] = value
    print (dfoutput)

#apply adf test on the series
adf_test(corr_df['bitcoin_Price'])

Results of Dickey-Fuller Test:
Test Statistic                   -1.657399
p-value                           0.453185
#Lags Used                        3.000000
Number of Observations Used    1270.000000
Critical Value (1%)              -3.435510
Critical Value (5%)              -2.863818
Critical Value (10%)             -2.567983
dtype: float64


As the value of the test statistic < critical value at all confidence intervals, we reject the null hypothesis that the series has a unit root. --> is stationary

### Kwiatkowski–Phillips–Schmidt–Shin

In [6]:
#define function for kpss test
from statsmodels.tsa.stattools import kpss
#define KPSS
def kpss_test(timeseries):
    print ('Results of KPSS Test:')
    kpsstest = kpss(timeseries, regression='c')
    kpss_output = pd.Series(kpsstest[0:3], index=['Test Statistic','p-value','Lags Used'])
    for key,value in kpsstest[3].items():
        kpss_output['Critical Value (%s)'%key] = value
    print (kpss_output)
    
kpss_test(corr_df['bitcoin_Price'])

Results of KPSS Test:
Test Statistic            3.724092
p-value                   0.010000
Lags Used                23.000000
Critical Value (10%)      0.347000
Critical Value (5%)       0.463000
Critical Value (2.5%)     0.574000
Critical Value (1%)       0.739000
dtype: float64



p-value is smaller than the indicated p-value



As the value of the test statistic > the critical value at all confidence intervals, we reject the null hypothesis that the series is stationary. --> isn't stationary

So in summary, the ADF test has an alternate hypothesis of linear or difference stationary, while the KPSS test identifies trend-stationarity in a series.

In [7]:
#corr_df['bitcoin_Price_diff'] = corr_df['bitcoin_Price'] - corr_df['bitcoin_Price'].shift(1)
#corr_df['bitcoin_Price_diff'].fillna(0,inplace=True)

In [8]:
#creating the train and validation set
train = corr_df[:int(0.8*(len(corr_df)))]
valid = corr_df[int(0.8*(len(corr_df))):]
valid_index = valid.index

In [9]:
model = VAR(endog=train)
model_fit = model.fit()

# make prediction on validation
prediction = model_fit.forecast(model_fit.y, steps=len(valid))

#converting predictions to dataframe

cols = train.columns
valid_pred = pd.DataFrame(prediction, columns=cols, index=valid_index)

#check rmse
for i in cols:
    print('rmse value for', i, 'is : ', sqrt(mean_squared_error(valid_pred[i], valid[i])))

rmse value for bitcoin_Price is :  3210.4006127992357
rmse value for bitcoin_High is :  3258.0131321051704
rmse value for bitcoin_Low is :  3134.440146723785
rmse value for bitcoin_30_day_ma is :  3096.263416730784
rmse value for bitcoin_30_day_std is :  278.2897710577435
rmse value for bitcoin_boll_upp is :  3303.2622145888095
rmse value for bitcoin_boll_low is :  2980.22056342203
rmse value for sp500_Price is :  377.5470764642217
rmse value for dax_Price is :  722.4394135828296
rmse value for googl_Price is :  216.64402082019032
rmse value for bitcoin_Google_Trends is :  506.8258663573864
rmse value for cryptocurrency_Google_Trends is :  57.85813835223614
rmse value for trading_Google_Trends is :  502.30808154788633
rmse value for bitcoin_pos_sents is :  0.024136555402127065
rmse value for bitcoin_neg_sents is :  0.01919623087634543
rmse value for bitcoin_quot_sents is :  0.6284196106237591
rmse value for economy_pos_sents is :  0.01669838489893981
rmse value for economy_neg_sents is


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.



In [10]:
df_cols = ["bitcoin_Price", "economy_quot_sents", "bitcoin_quot_sents", "bitcoin_Google_Trends"]

valid_01 = valid[df_cols]
model = VAR(endog=train[df_cols])
model_fit = model.fit()

# make prediction on validation
prediction = model_fit.forecast(model_fit.y, steps=len(valid_01))

#converting predictions to dataframe

cols = df_cols
valid_pred = pd.DataFrame(prediction, columns=cols, index=valid_index)

#check rmse
for i in cols:
    print('rmse value for', i, 'is : ', sqrt(mean_squared_error(valid_pred[i], valid[i])))

rmse value for bitcoin_Price is :  4652.601051562208
rmse value for economy_quot_sents is :  0.3147175348638063
rmse value for bitcoin_quot_sents is :  1.054425612350079
rmse value for bitcoin_Google_Trends is :  661.2911704269701



A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.



In [16]:
#make final predictions
num_forcast = 5
 
model = VAR(endog=corr_df)
model_fit = model.fit()
yhat = model_fit.forecast(model_fit.y, steps=num_forcast)

yhat_df = pd.DataFrame(yhat, columns=cols)

forecast_dates = pd.date_range(start=corr_df.index[-1], periods=num_forcast+1)[1:]

yhat_df = yhat_df.set_index(forecast_dates)


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.



In [17]:
fig = make_subplots(
    rows=2, 
    cols=1, 
    shared_xaxes=True, 
    vertical_spacing=0.2,
    subplot_titles=(["Bitcoin Price Chart<br>with Validation<br>and Forecast",
                     "Bitcoin 30-day-Mean"])
)

fig.add_trace(go.Scatter(x=corr_df.index, 
                         y=corr_df['bitcoin_Price'],
                         name="BITCOIN Closing Price"), row=1, col=1)

fig.add_trace(go.Scatter(x=valid_pred.index, 
                         y=valid_pred['bitcoin_Price'],
                         name="BITCOIN Validation Prediction"), row=1, col=1)

fig.add_trace(go.Scatter(x=yhat_df.index, 
                         y=yhat_df['bitcoin_Price'],
                         name="BITCOIN Current Forecast"), row=1, col=1)

fig.add_trace(go.Scatter(x=corr_df.index, 
                         y=corr_df['bitcoin_30_day_ma'],
                         name="BITCOIN 30 Closing Price"), row=2, col=1)

fig.add_trace(go.Scatter(x=valid_pred.index, 
                         y=valid_pred['bitcoin_30_day_ma'],
                         name="Validation Prediction"), row=2, col=1)

fig.add_trace(go.Scatter(x=yhat_df.index, 
                         y=yhat_df['bitcoin_30_day_ma'],
                         name="Current Forecast"), row=2, col=1)

fig.update_layout(height=1000, width=1500, title_text="Bitcoin Prediction using VAR (Vector Autoregression)")


KeyError: 'bitcoin_30_day_ma'