In [None]:
import pandas as pd
import os

In [None]:
# Load the merged dataset. The first CSV column becomes the index and pandas
# attempts to parse it as dates.
df = pd.read_csv(
    "./complete-merged-df.csv",
    index_col=0,
    parse_dates=True,
)
df.head()

In [None]:
# Display the column labels of the loaded frame.
df.columns

In [None]:
# Count the rows that contain at least one missing value.
rows_with_nan = df.isna().any(axis=1)
print("NaN values: ", int(rows_with_nan.sum()))

# Feature selection

In [None]:
from scipy.stats import linregress
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

def correlation_test(v1,v2, graph=False):
    """Print the correlation matrix and a simple linear regression of two samples.

    Parameters
    ----------
    v1, v2 : array-like
        The two samples; passed unchanged to ``np.corrcoef`` and
        ``scipy.stats.linregress`` (``v1`` as x, ``v2`` as y).
    graph : bool, default False
        When true, additionally show a scatter plot of ``v1`` against ``v2``.

    Returns
    -------
    None
        All results are reported through ``print`` (and the optional plot).
    """
    corr_matrix = np.corrcoef(v1, v2)
    fit = linregress(v1, v2)

    print("Coef : ", corr_matrix)
    print("Linear Regression results : ")
    # Each regression statistic is printed rounded to five decimals.
    report = (
        ("p", fit.pvalue),
        ("slope", fit.slope),
        ("intercept", fit.intercept),
        ("std. err", fit.stderr),
    )
    for label, value in report:
        print(f"\t{label} = {round(value,5)}")
    print("-----------------------------")

    if not graph:
        return
    plt.scatter(v1, v2)
    plt.show()

# correlation to btc closing price and miner revenue
def corr_plot(y_label, y_data, y2_label, y2_data, x):
    """Overlay two series on one figure using independent left/right y-axes.

    ``y_data`` is drawn in red against the left axis and ``y2_data`` in blue
    against a twinned right axis; both are plotted over ``x``. The x tick marks
    and tick labels are hidden, so only the 'Date' axis label remains.

    Parameters
    ----------
    y_label, y2_label : str
        Axis labels for the left and right y-axes.
    y_data, y2_data : array-like
        Values plotted against the left and right y-axes respectively.
    x : array-like
        Shared x values.
    """
    def _draw(axis, label, series, colour):
        # Label, tick labels and line all share one colour per axis.
        axis.set_ylabel(label, color=colour, size='x-large')
        axis.tick_params(axis='y', labelcolor=colour, labelsize='large')
        axis.plot(x, series, color=colour)

    fig, left_axis = plt.subplots()
    fig.set_size_inches(18, 10)

    left_axis.set_xlabel('Date')
    left_axis.tick_params(axis='x', bottom=False, labelbottom=False)
    _draw(left_axis, y_label, y_data, 'red')

    # The second series gets its own y scale on a twin of the first axis.
    right_axis = left_axis.twinx()
    _draw(right_axis, y2_label, y2_data, 'blue')

# Visual check: closing price and SVI drawn on twin y-axes over the index.
corr_plot("Bitcoin Price", df["close"], "SVI", df["SVI"], df.index)

# Pairwise correlation / regression reports, printed in this order.
column_pairs = [
    ('n-transactions', 'SVI'),
    ('close', 'cost-per-transaction'),
    ('close', 'hash-rate'),
    ('close', 'n-transactions'),
    ('close', 'Gold price'),
]
for first_col, second_col in column_pairs:
    correlation_test(df[first_col], df[second_col])

In [None]:
from sklearn.linear_model import LinearRegression
from statsmodels.regression.linear_model import OLS
import statsmodels.api as sm

# Regress the closing price on every other column of df.
y = df['close']  # response variable
predictors = df.drop(columns=["close"])
X = sm.add_constant(predictors)  # add the constant (intercept) column

# Fit ordinary least squares and display the summary table (includes AIC).
model = sm.OLS(y, X).fit()
model.summary()

# Unit Root Testing

In [None]:
from statsmodels.tsa.stattools import adfuller

def make_stationary(data: pd.Series, alpha: float = 0.05, max_diff_order: int = 10) -> dict:
    """Difference a series until the ADF test rejects a unit root.

    The raw series is tested first. If it is not stationary, ``data.diff(i)``
    is tried for ``i = 1 .. max_diff_order`` and the smallest ``i`` whose ADF
    p-value is below ``alpha`` is used.

    Parameters
    ----------
    data : pd.Series
        The time series to test.
    alpha : float, default 0.05
        Significance level compared against the ADF p-value.
    max_diff_order : int, default 10
        Largest value of ``i`` tried in ``data.diff(i)`` (inclusive).

    Returns
    -------
    dict
        ``{'differencing_order': int, 'time_series': np.ndarray}``. The array
        has the same length as ``data``.

    Raises
    ------
    ValueError
        If neither the raw series nor any tested difference is stationary at
        level ``alpha`` (previously this surfaced as an opaque ``IndexError``).

    Notes
    -----
    * ``Series.diff(i)`` subtracts the value ``i`` steps earlier; it is a
      lag-``i`` difference, not ``i`` successive first differences.
    * The leading ``i`` NaN values produced by ``diff`` are replaced with the
      mean of the *original* series so the output keeps the input length.
      NOTE(review): those filled values are on the level scale, not the
      differenced scale - confirm this is intended before modelling on them.
    """
    # Already stationary: hand the series back untouched.
    if adfuller(data)[1] < alpha:
        return {
            'differencing_order': 0,
            'time_series': np.array(data)
        }

    fill_value = data.mean()
    # Lags are tried in ascending order, so the first hit is the smallest
    # passing one and the remaining ADF tests can be skipped.
    for lag in range(1, max_diff_order + 1):
        candidate = data.diff(lag).fillna(fill_value)
        if adfuller(candidate)[1] < alpha:
            return {
                'differencing_order': lag,
                'time_series': np.array(candidate)
            }

    raise ValueError(
        f"series is not stationary at alpha={alpha} for any differencing "
        f"order up to {max_diff_order}"
    )

In [None]:
def test_stationarity(x, mean_window=22, std_window=12):
    """Plot rolling statistics of a series and print an ADF test report.

    Parameters
    ----------
    x : pd.Series
        Series to inspect; it must support ``.rolling`` and is passed to
        ``adfuller`` unchanged.
    mean_window : int, default 22
        Window length of the rolling mean.
    std_window : int, default 12
        Window length of the rolling standard deviation.

    Returns
    -------
    None
        The plot is shown and the ADF statistic, p-value, verdict and critical
        values are printed.
    """
    # Rolling statistics
    rolmean = x.rolling(window=mean_window, center=False).mean()
    rolstd = x.rolling(window=std_window, center=False).std()

    # Plot the series together with its rolling statistics
    plt.plot(x, color='blue', label='Original')
    plt.plot(rolmean, color='red', label='Rolling Mean')
    plt.plot(rolstd, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)

    # Augmented Dickey-Fuller test
    result = adfuller(x)
    adf_stat, critical_values = result[0], result[4]
    print('ADF Statistic: %f' % adf_stat)
    print('p-value: %f' % result[1])

    # The verdict compares the statistic with the FIRST critical value only
    # (same decision as before, without a loop that always broke on its
    # first iteration).
    first_threshold = next(iter(critical_values.values()), None)
    if first_threshold is not None:
        if adf_stat > first_threshold:
            print("The graph is non stationary")
        else:
            print("The graph is stationary")

    print('Critical values:')
    for key, value in critical_values.items():
        print('\t%s: %.3f ' % (key, value))

In [None]:
# Make every column stationary and print the ADF p-value of the raw column.
differenced_df = {}
for column in df.columns:
    raw_series = df[column]
    differenced_df[column] = make_stationary(raw_series)["time_series"]
    print(column, adfuller(raw_series)[1])
diff = pd.DataFrame(differenced_df)

# Plot and report the stationarity check for each transformed column.
for column, values in diff.items():
    print(column)
    test_stationarity(values)

### Model sampling 
1. LSTM
2. ARIMA

In [None]:
# LSTM
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM

# Creating the dataframe: rows ordered by index, oldest first.
data = df.sort_index(ascending=True, axis=0)

# Univariate frame with only the closing price. The rest of this cell reads
# it as `new_data` with a 'Close' column; it was previously left undefined
# (commented out), so the cell failed with NameError on a fresh kernel.
new_data = data[['close']].rename(columns={'close': 'Close'})

TRAIN_SIZE = 987  # number of leading rows used for training
LOOKBACK = 60     # length of the input window fed to the LSTM
assert len(new_data) > TRAIN_SIZE > LOOKBACK, \
    "not enough rows for the train/validation split"

# Creating train and validation sets (single column: the closing price).
dataset = new_data.values
train = dataset[:TRAIN_SIZE, :]
valid = dataset[TRAIN_SIZE:, :]

# Scale to [0, 1]. The scaler is fit on the training rows only so that no
# information from the validation period leaks into the scaling.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(train)
scaled_data = scaler.transform(dataset)

# Each sample is the previous LOOKBACK scaled prices; the target is the next one.
x_train = np.array([scaled_data[i - LOOKBACK:i, 0] for i in range(LOOKBACK, len(train))])
y_train = np.array([scaled_data[i, 0] for i in range(LOOKBACK, len(train))])

# Keras LSTM layers expect (samples, timesteps, features).
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))

# create and fit the LSTM network
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(x_train.shape[1], 1)))
model.add(LSTM(units=50))
model.add(Dense(1))

model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(x_train, y_train, epochs=1, batch_size=1, verbose=2)

# Predict every validation row from the LOOKBACK values that precede it.
inputs = new_data[len(new_data) - len(valid) - LOOKBACK:].values
inputs = inputs.reshape(-1, 1)
inputs = scaler.transform(inputs)

X_test = np.array([inputs[i - LOOKBACK:i, 0] for i in range(LOOKBACK, inputs.shape[0])])
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

closing_price = model.predict(X_test)
closing_price = scaler.inverse_transform(closing_price)

# Root-mean-squared error on the validation rows (both arrays are (n, 1)).
rms = np.sqrt(np.mean(np.power((valid - closing_price), 2)))
print(rms)

# For plotting: switch back to the labelled frames. The validation slice is
# copied so that adding a column does not write into a view of new_data.
train = new_data[:TRAIN_SIZE]
valid = new_data[TRAIN_SIZE:].copy()
valid['Predictions'] = closing_price
plt.plot(train['Close'])
plt.plot(valid[['Close', 'Predictions']])

In [None]:
# ARIMA
# statsmodels 0.13 removed the legacy ``statsmodels.tsa.arima_model.ARIMA``.
# Prefer the replacement class and fall back only on installations that
# predate it.
try:
    from statsmodels.tsa.arima.model import ARIMA
except ImportError:
    from statsmodels.tsa.arima_model import ARIMA
from pandas.tseries.offsets import DateOffset
import statsmodels.api as sm

# MA(1) model on the closing price.
model = ARIMA(df['close'], order=(0,0,1))
model_fit = model.fit()
# A bare expression in the middle of a cell is not displayed, so print it.
print(model_fit.summary())

# Dynamic prediction for positions 90..103; rows outside that range stay NaN.
# NOTE(review): this adds a 'forecast' column to df in place, so cells that
# iterate over df.columns behave differently when re-run afterwards.
df['forecast'] = model_fit.predict(start=90, end=103, dynamic=True)
df[['close','forecast']].plot(figsize=(12,8), title='ARIMA')

In [None]:
# SARIMAX
# Seasonal ARIMA on the closing price: (1,1,1) x (1,1,1) with period 12.
sarimax_orders = dict(order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))
model = sm.tsa.statespace.SARIMAX(df['close'], **sarimax_orders)
results = model.fit()

# Dynamic prediction for positions 90..103, stored next to the observed
# prices (this overwrites df['forecast'] in place).
window_forecast = results.predict(start=90, end=103, dynamic=True)
df['forecast'] = window_forecast
df[['close', 'forecast']].plot(figsize=(12, 8), title='SARIMAX')

In [None]:

# Future prediction
# Extend the index by N_FUTURE_DAYS daily steps past the last observation.
# future_dates[0] is the last observed date itself and is dropped below.
N_FUTURE_DAYS = 11
future_dates = [df.index[-1] + DateOffset(days=x) for x in range(0, N_FUTURE_DAYS + 1)]
print(future_dates)

# Placeholder rows for the future dates. They are created as float NaN so the
# concatenated columns stay numeric and can be plotted.
future_datest_df = pd.DataFrame(np.nan, index=future_dates[1:], columns=df.columns)

future_df = pd.concat([df, future_datest_df])

# predict() without arguments only covers the fitted sample, which left the
# future rows empty. Request every position up to the last future row and
# assign by position so the result does not depend on index alignment.
future_df['forecast'] = np.asarray(results.predict(start=0, end=len(future_df) - 1))

future_df[['close', 'forecast']].plot(figsize=(12, 8))

# TEST

In [None]:
# Plot the frame against its lag-1 and lag-12 differences, one panel each.
fig, axes = plt.subplots(2, 1, figsize=(12,12), dpi=100, sharex=True)

# (lag, panel title / label of the differenced lines, extra line styling)
panels = [
    (1, 'Usual Differencing', {}),
    (12, 'Seasonal Differencing', {'color': 'green'}),
]
for ax, (lag, title, line_style) in zip(axes, panels):
    ax.plot(df[:], label='Original Series')
    ax.plot(df[:].diff(lag), label=title, **line_style)
    ax.set_title(title)
    ax.legend(loc='upper left', fontsize=10)

plt.show()

In [None]:
# Compute Seasonal Index
from statsmodels.tsa.seasonal import seasonal_decompose
from dateutil.parser import parse

# Multiplicative seasonal decomposition of the closing price.
# NOTE(review): seasonal_decompose needs a period; here it has to be inferred
# from the index frequency - confirm df.index carries one, or pass `period`.
result_mul = seasonal_decompose(df['close'],
                                model='multiplicative',
                                extrapolate_trend='freq')

# Keep the last 12 seasonal factors as a one-column frame named 'seasonal'.
# (`seasonal[-12]` selected a single scalar, which has no `.to_frame()`.)
seasonal_index = result_mul.seasonal.iloc[-12:].to_frame()
seasonal_index['month'] = pd.to_datetime(seasonal_index.index).month

# Merge with the base data: closing price plus calendar month.
# The previous merge used `data`, which has no 'month' column and more than
# three columns, so both the merge key and the rename below failed.
df['month'] = df.index.month
dfx = pd.merge(
    df[['close', 'month']],
    # One factor per month keeps the left join from duplicating rows when
    # several of the last 12 observations fall in the same month.
    seasonal_index.drop_duplicates(subset='month'),
    how='left',
    on='month',
)
dfx.columns = ['value', 'month', 'seasonal_index']
# NOTE: merge resets the index, so dfx is indexed 0..n-1, not by date.
dfx

In [None]:
import pmdarima as pm

# SARIMAX model: stepwise auto_arima search on 'value', with the seasonal
# index passed as the exogenous regressor.
search_options = dict(
    start_p=1, start_q=1,
    test='adf',
    max_p=3, max_q=3, m=12,
    start_P=0, seasonal=True,
    d=None, D=1, trace=True,
    error_action='ignore',
    suppress_warnings=True,
    stepwise=True,
)
sxmodel = pm.auto_arima(
    dfx[['value']],
    exogenous=dfx[['seasonal_index']],
    **search_options,
)

sxmodel.summary()

In [None]:
# Forecast
n_periods = 24
# The 12 seasonal factors are repeated twice to cover the 24 forecast steps.
fitted, confint = sxmodel.predict(n_periods=n_periods,
                                  exogenous=np.tile(seasonal_index.seasonal, 2).reshape(-1,1),
                                  return_conf_int=True)

# Observed closing prices in index order. `data` has no 'value' column, so
# the history is taken from df['close'] directly.
history = df['close'].sort_index()
index_of_fc = pd.date_range(history.index[-1], periods=n_periods, freq='MS')

# Make series for plotting. np.asarray drops any index the model attached to
# its output; otherwise pd.Series(..., index=...) would reindex it to NaN.
fitted_series = pd.Series(np.asarray(fitted), index=index_of_fc)
confint = np.asarray(confint)
lower_series = pd.Series(confint[:, 0], index=index_of_fc)
upper_series = pd.Series(confint[:, 1], index=index_of_fc)

# Plot history, forecast and the confidence band.
plt.plot(history)
plt.plot(fitted_series, color='darkgreen')
plt.fill_between(lower_series.index,
                 lower_series,
                 upper_series,
                 color='k', alpha=.15)

plt.title("SARIMAX Forecast of Bitcoin Price")
plt.show()