In [None]:
import pandas as pd
from matplotlib import pyplot

In [None]:
# Load the merged dataset; the first CSV column becomes the index and pandas
# is asked to parse it as dates.
df = pd.read_csv("../complete-merged-df.csv", index_col=0, parse_dates=True)
df.head()

In [None]:
# Number of columns followed by the column labels themselves.
len(df.columns), df.columns

In [None]:
# Count the rows that contain at least one missing value.
print("NaN values: ", int(df.isna().any(axis=1).sum()))

In [None]:
# Overview grid: one small line plot per column on a 5x3 layout.
# `pyplot` is the alias bound by the import cell at the top of the notebook;
# the `plt` alias is only bound in a later cell, so using it here raises a
# NameError on a fresh kernel (Restart & Run All).
fig, axes = pyplot.subplots(nrows=5, ncols=3, dpi=300, figsize=(11, 6))

# zip() stops at the shorter sequence: at most 15 columns are drawn, and a
# frame with fewer than 15 columns no longer raises an IndexError.
for ax, column in zip(axes.flatten(), df.columns):
    ax.plot(df[column], color='black', linewidth=1)
    ax.set_title(column)
    ax.xaxis.set_ticks_position('none')
    ax.yaxis.set_ticks_position('none')
    ax.spines['top'].set_alpha(0)
    ax.tick_params(labelsize=6)

fig.autofmt_xdate()
pyplot.tight_layout()


# Feature selection

In [None]:
from scipy.stats import linregress
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

def correlation_test(v1,v2, graph=False):
    """Print the correlation matrix and a simple linear regression for two samples.

    Parameters
    ----------
    v1, v2 : array-like
        Passed unchanged to ``np.corrcoef`` and ``linregress``.
    graph : bool, default False
        When true, additionally show a scatter plot of ``v1`` against ``v2``.

    Returns
    -------
    None
        All results are reported through ``print``.
    """
    correlation = np.corrcoef(v1, v2)
    regression = linregress(v1, v2)

    print("Coef : ", correlation)
    report = [
        "Linear Regression results : ",
        f"\tp = {round(regression.pvalue,5)}",
        f"\tslope = {round(regression.slope,5)}",
        f"\tintercept = {round(regression.intercept,5)}",
        f"\tstd. err = {round(regression.stderr, 5)}",
        "-----------------------------",
    ]
    print("\n".join(report))

    if not graph:
        return
    plt.scatter(v1, v2)
    plt.show()

# correlation to btc closing price and miner revenue
def corr_plot(y_label, y_data, y2_label, y2_data, x):
    """Plot two series against the same x values on separate left/right y axes.

    Parameters
    ----------
    y_label, y_data
        Axis label and values for the left-hand axis (drawn in red).
    y2_label, y2_data
        Axis label and values for the right-hand twin axis (drawn in blue).
    x
        Shared x values for both series.
    """
    def draw(axis, label, series, colour):
        # Label, tick styling and line all share one colour per axis.
        axis.set_ylabel(label, color=colour, size='x-large')
        axis.tick_params(axis='y', labelcolor=colour, labelsize='large')
        axis.plot(x, series, color=colour)

    fig, left_axis = plt.subplots()
    fig.set_figwidth(18)
    fig.set_figheight(10)

    # The x axis is labelled, but its ticks and tick labels are hidden.
    left_axis.set_xlabel('Date')
    left_axis.tick_params(axis='x', bottom=False, labelbottom=False)

    draw(left_axis, y_label, y_data, 'red')
    draw(left_axis.twinx(), y2_label, y2_data, 'blue')

# Closing price and SVI drawn against the frame's index on twin axes.
corr_plot("Bitcoin Price", df["close"], "SVI", df["SVI"], df.index)

# Pairwise correlation / regression reports, one per column pair.
for first_column, second_column in (
    ('n-transactions', 'SVI'),
    ('close', 'cost-per-transaction'),
    ('close', 'hash-rate'),
    ('close', 'n-transactions'),
    ('close', 'Gold price'),
):
    correlation_test(df[first_column], df[second_column])

In [None]:
# Close price correlation matrix
# Columns are ordered by their correlation with 'close' (largest first) and the
# correlation matrix is recomputed in that order.
k = len(df.columns)  # number of variables for heatmap
cols = df.corr().nlargest(k, 'close')['close'].index
cm = df[cols].corr()
# Styler.set_precision was deprecated in pandas 1.3 and removed in pandas 2.0;
# Styler.format(precision=...) is the supported way to round displayed values.
cm.style.background_gradient(cmap='coolwarm').format(precision=2)


In [None]:
# Split the frame into a feature matrix and a target vector by column position.
# NOTE(review): this assumes column 0 is the target and columns 1..14 are the
# features — confirm against df.columns.
array = df.values
X = array[:, 1:15]
# The target is cast to integers, which drops any fractional part.
Y = array[:, 0].astype('int')

# print(f"X at (0): {X[0]}")
# print(f"Y at (0): {Y[0]}")
X.shape, Y.shape


In [None]:
# CHI SQUARE TEST
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import numpy as np

# Feature extraction
# Keep the 9 features with the highest chi2 score.
# NOTE(review): sklearn's chi2 is documented for non-negative features and
# class-label targets; Y here is an integer-cast column of the frame — confirm
# this test is appropriate for the data.
test = SelectKBest(score_func=chi2, k=9)
chifit = test.fit(X, Y)

# Summarize scores
# set_printoptions changes NumPy's print precision for the rest of the session.
np.set_printoptions(precision=3)
print(list(zip(df.columns[1:], chifit.scores_)))

# Matrix reduced to the selected columns.
features = chifit.transform(X)
# Summarize selected features
# print(features[0:5, :])
len(chifit.scores_)

In [None]:
# RFE - Recursive Feature Elimination
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# Feature extraction
# Recursive feature elimination around a linear regression, removing one
# feature per step until 10 remain.
model = LinearRegression()
rfe = RFE(model, n_features_to_select=10, step=1)

fit = rfe.fit(X, Y)

print("Num Features: %s" % (fit.n_features_))
# print("Selected Features: %s" % (fit.support_))
# print("Feature Ranking: %s" % (fit.ranking_))

# One row per feature: name, RFE ranking, RFE support flag and chi2 score.
# NOTE(review): the 'Chi-Support' column holds the integer-cast chi2 *scores*
# (chifit.scores_), not a selected/not-selected flag — confirm the intended
# content or rename the column.
features = pd.DataFrame(list(zip(df.columns[1:], fit.ranking_, fit.support_,  chifit.scores_.astype(int))), 
columns=['Features', 'Ranking', 'RFE-Support','Chi-Support'])
# Replace the boolean support mask with readable labels.
features['RFE-Support'] = features['RFE-Support'].apply(lambda x: 'Selected' if x else 'Not Selected')


In [None]:
# Names of the first ten features when ordered by ascending RFE ranking.
features.sort_values(by=['Ranking'], ascending=True)[:10]["Features"].to_list()

In [None]:
# plot features by rank
# One horizontal bar per feature; bar length is the RFE ranking value.
sns.barplot(x='Ranking', y='Features', data=features.sort_values(by=['Ranking']))

In [None]:
# Candidate feature count: every column of the frame except one.
print("Features: %s" % (len(df.columns)-1))
# Features grouped by their 'RFE-Support' label.
sns.catplot(x='RFE-Support', y='Features', data=features.sort_values(by=['RFE-Support']))

In [None]:
# KFold Gridsearch for the best number of params for a Regression model

from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

# step-1: create a cross-validation scheme
# NOTE(review): shuffled folds mix earlier and later rows; confirm this is
# acceptable if the rows form a time series.
folds = KFold(n_splits=10, shuffle=True, random_state=1)

# step-2: specify range of hyperparameters to tune
hyper_params = [{'n_features_to_select': list(range(2, 15))}]


# step-3: perform grid search
# 3.1 specify model
# The estimator is handed over unfitted: RFE clones and refits it on every
# split, so fitting `lm` on the full data beforehand has no effect on the search.
lm = LinearRegression()
rfe = RFE(lm)

# 3.2 call GridSearchCV()
model_cv = GridSearchCV(estimator=rfe,
                        param_grid=hyper_params,
                        scoring='r2',
                        cv=folds,
                        verbose=1,
                        return_train_score=True)
model_cv.fit(X, Y)

# cv results
# Rank candidates by held-out (test) score. Ranking by train score favours the
# settings that fit the training folds best, which is a biased selection rule.
cv_results = pd.DataFrame(model_cv.cv_results_)
cv_results[["param_n_features_to_select", "mean_test_score", "std_test_score",
            "mean_train_score", "std_train_score"]].sort_values(by=["mean_test_score", "param_n_features_to_select"], ascending=False)


In [None]:
# plotting cv results
# Mean test and train R^2 as a function of the number of selected features.
pyplot.figure(figsize=(16, 6))

for score_column in ("mean_test_score", "mean_train_score"):
    pyplot.plot(cv_results["param_n_features_to_select"], cv_results[score_column])

pyplot.xlabel('number of features')
pyplot.ylabel('r-squared')
pyplot.title("Optimal Number of Features")
pyplot.legend(['Test score', 'Train score'], loc='upper left')


In [None]:
# explore the number of selected features for RFE
from numpy import mean
from numpy import std
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from matplotlib import pyplot

# get a list of models to evaluate
def get_models():
    """Build one RFE + LinearRegression pipeline per candidate feature count.

    Returns
    -------
    dict
        Maps ``str(n)`` for ``n`` in 2..14 to a ``Pipeline`` whose step ``'s'``
        is ``RFE(LinearRegression(), n_features_to_select=n)`` and whose step
        ``'m'`` is a fresh ``LinearRegression``.
    """
    def build(n_features):
        # Selector and final model each get their own estimator instance.
        selector = RFE(estimator=LinearRegression(), n_features_to_select=n_features)
        return Pipeline(steps=[('s', selector), ('m', LinearRegression())])

    return {str(n_features): build(n_features) for n_features in range(2, 15)}
 
# get the models to evaluate
models = get_models()

# evaluate the models and store results
# Every pipeline is scored with the same seeded 10-fold split.
folds = KFold(n_splits=10, shuffle=True, random_state=1)

names, results = [], []
for name, model in models.items():
    scores = cross_val_score(model, X, Y, scoring='r2', cv=folds)

    names.append(name)
    results.append(scores)
    # name, mean R^2 and its standard deviation across folds
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))


In [None]:
# plot model performance for comparison
# One box per candidate feature count, built from the per-fold scores in
# `results`; showmeans adds a marker for the mean of each box.
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.xlabel('Number of Features')
pyplot.ylabel('R-squared')
pyplot.show()


In [None]:
# NOTE(review): `results` (per-fold score arrays) and `scores` are rebound here
# to different kinds of objects; cells that expect the per-fold arrays must run
# before this one.
results = [mean(x) for x in results] # calculate mean for each model score
scores = pd.DataFrame({"r2_score": results, "n_features": names}) # store results in dataframe
scores.sort_values(by="r2_score", ascending=False)[0:3] # top 3 scores 

In [None]:
# AIC
from sklearn.linear_model import LinearRegression
from statsmodels.regression.linear_model import OLS
import statsmodels.api as sm

y = df['close']  # define response variable

# Use the ten best-ranked features from the RFE table. Slicing
# features['Features'][:10] directly would take the first ten rows in column
# order and ignore the ranking. A stable sort keeps tied ranks in their
# original order.
predictors = df[features.sort_values(by='Ranking', kind='mergesort')['Features'][:10].values]
X = sm.add_constant(predictors)  # add constant to predictor variables

#fit regression model
model = sm.OLS(y, X).fit()
model.summary()  # view AIC of model


In [None]:
# Headline numbers of the fitted OLS model; the constant is not counted as a
# selected parameter.
print("AIC : ",model.aic)
print("R2 : ",model.rsquared)
print("Selected params : ",len(model.params.keys())-1)

# Per-predictor p-values (constant removed) with the outcome of a 5% test.
aic_pvalues = pd.DataFrame(model.pvalues, columns=['p-values']).drop(['const'])
aic_pvalues['h0'] = np.where(aic_pvalues['p-values'] < 0.05, 'Rejected', 'Not Rejected')
# Format for display only after the numeric comparison above.
aic_pvalues["p-values"] = aic_pvalues["p-values"].map('{:.3f}'.format)
aic_pvalues.sort_values(by='h0', ascending=False)

# Unit Root Testing

In [None]:
from statsmodels.tsa.stattools import adfuller

def make_stationary(data: pd.Series, alpha: float = 0.05, max_diff_order: int = 10) -> dict:
    """Difference a series until the ADF test rejects a unit root.

    Parameters
    ----------
    data : pd.Series
        Series to test and, if necessary, difference.
    alpha : float, default 0.05
        Significance level compared with the ADF p-value.
    max_diff_order : int, default 10
        Highest differencing order that is tried.

    Returns
    -------
    dict
        ``'differencing_order'``: smallest order (0 if ``data`` already passes)
        whose ADF p-value is below ``alpha``.
        ``'time_series'``: ``np.ndarray`` of the same length as ``data``.

    Raises
    ------
    ValueError
        If no order up to ``max_diff_order`` passes the test. (This situation
        previously surfaced as an ``IndexError`` on an empty list.)

    Notes
    -----
    * Order ``d`` means ``diff()`` applied ``d`` times. ``Series.diff(d)`` is a
      lag-``d`` difference (``x[t] - x[t-d]``), which is a different transform.
    * The ``d`` leading values that differencing leaves undefined are excluded
      from the ADF test and returned as ``0.0`` ("no change"). Filling them with
      the mean of the undifferenced series put level-sized outliers at the start
      of the differenced series.
    """
    # Test to see if the time series is already stationary
    if adfuller(data)[1] < alpha:
        return {
            'differencing_order': 0,
            'time_series': np.array(data)
        }

    differenced = data
    for diff_order in range(1, max_diff_order + 1):
        differenced = differenced.diff()
        # Only the defined part of the series is tested.
        p_value = adfuller(differenced.iloc[diff_order:])[1]
        if p_value < alpha:
            # Orders are tried in ascending order, so the first hit is the smallest.
            return {
                'differencing_order': diff_order,
                'time_series': np.array(differenced.fillna(0.0))
            }

    raise ValueError(
        f"series is not stationary at alpha={alpha} for any differencing "
        f"order up to {max_diff_order}"
    )

In [None]:
def test_stationarity(x, ax, name, alpha=0.05):
    """Plot rolling statistics of a series and print an ADF stationarity verdict.

    Parameters
    ----------
    x : pd.Series
        Series to inspect.
    ax : matplotlib Axes
        Axes that receives the original series, its rolling mean (window 22)
        and its rolling standard deviation (window 12).
    name : str
        Title of the axes.
    alpha : float, default 0.05
        Significance level for the ADF p-value. The verdict is taken from the
        p-value so that it agrees with ``make_stationary``; the earlier loop
        over critical values always stopped after the first entry.
    """
    #Determing rolling statistics
    rolmean = x.rolling(window=22,center=False).mean()
    rolstd = x.rolling(window=12,center=False).std()

    #Plot rolling statistics:
    ax.plot(x, color='blue',label='Original')
    ax.plot(rolmean, color='red', label='Rolling Mean')
    ax.plot(rolstd, color='black', label = 'Rolling Std')
    ax.set_title(name)
    # The line labels above are only visible once a legend is drawn.
    ax.legend(fontsize='small')

    #Perform Dickey Fuller test
    result=adfuller(x)

    print('ADF Stastistic: %f'%result[0])
    print('p-value: %f'%result[1])

    if result[1] < alpha:
        print("The graph is stationery")
    else:
        print("The graph is non stationery")

   #  print('Critical values:')
   #  for key,value in result[4].items():
      #   print('\t%s: %.3f ' % (key, value))   

In [None]:
# ps = {}
# Make every column stationary and print the ADF p-value of the *original*
# column next to its name.
differenced_df = {}
for i in df.columns:
    ts = make_stationary(df[i])
    differenced_df[i] = ts["time_series"]
    print(i, adfuller(df[i])[1])
# make_stationary returns bare arrays of the same length as its input, so the
# original index is re-attached here; without it the frame gets a 0..n-1 index
# and the rows lose their dates.
diff = pd.DataFrame(differenced_df, index=df.index)

# for i in diff.columns:  
    # print(i)

In [None]:
# try:
#     diff.reset_index(inplace=True)
#     diff.drop(['index'], axis=1, inplace=True)
#     diff["Date"] = df.index
#     diff.set_index(diff.Date, inplace=True)
# except ValueError as e:
#     print("Row indexing issue: ", e)


# diff
# diff.index = pd.to_datetime(diff.index)
# Persist the differenced frame (index included) to the working directory.
diff.to_csv('differenced_df.csv')

In [None]:
# 2x2 grid of stationarity checks on four columns of the differenced frame.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)

test_stationarity(diff.close, ax=ax1, name="Close")
test_stationarity(diff.open, ax=ax2, name="Open")
# The third panel is chosen by position (8th column) rather than by name.
test_stationarity(diff[diff.columns[7]], ax=ax3, name=diff.columns[7])
test_stationarity(diff.SVI, ax=ax4, name="SVI")

# Hide tick labels on inner edges so only the outer axes are labelled.
for ax in fig.get_axes():
    ax.label_outer()


### Model sampling 
1. LSTM
2. ARIMA

In [None]:
# LSTM
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM

#creating dataframe
data = diff.sort_index(ascending=True, axis=0)

# Single-column float frame with the differenced close series, built in one
# vectorised step. Filling a pre-allocated frame with chained assignment
# (`new_data['Date'][i] = ...`) is not guaranteed to write into the frame and
# leaves object-dtype columns.
new_data = data[['close']].rename(columns={'close': 'Close'}).astype(float)
new_data.index.name = 'Date'

#creating train and test sets
# Only the Close column is modelled, so the scaler, the training windows and
# the test windows all work on the same single feature. A scaler fitted on
# every column of `data` cannot transform a one-column array.
dataset = new_data.values  # shape (n_samples, 1)

n_train_days = 365 * 4
n_lookback = 60  # past observations fed to the network for each prediction
if len(dataset) <= n_train_days:
    raise ValueError("not enough rows to hold out a validation set after n_train_days")

train = dataset[0:n_train_days,:]
valid = dataset[n_train_days:, :]

#converting dataset into x_train and y_train
# The scaler is fitted on the training rows only, so the value range of the
# validation rows does not leak into the scaling.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(train)
scaled_data = scaler.transform(dataset)

x_train, y_train = [], []
for i in range(n_lookback, len(train)):
    x_train.append(scaled_data[i-n_lookback:i,0])
    y_train.append(scaled_data[i,0])
x_train, y_train = np.array(x_train), np.array(y_train)

# Keras LSTM layers expect (samples, timesteps, features).
x_train = np.reshape(x_train, (x_train.shape[0],x_train.shape[1],1))

# create and fit the LSTM network
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(x_train.shape[1],1)))
model.add(LSTM(units=50))
model.add(Dense(1))

model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(x_train, y_train, epochs=1, batch_size=1, verbose=2)

# One prediction per validation row; the first window reaches n_lookback rows
# back into the training data.
inputs = scaled_data[len(scaled_data) - len(valid) - n_lookback:]

X_test = []
for i in range(n_lookback, inputs.shape[0]):
    X_test.append(inputs[i-n_lookback:i,0])
X_test = np.array(X_test)

X_test = np.reshape(X_test, (X_test.shape[0],X_test.shape[1],1))
closing_price = model.predict(X_test)
closing_price = scaler.inverse_transform(closing_price)

# Root-mean-square error on the validation rows (both arrays are (n_valid, 1)).
rms=np.sqrt(np.mean(np.power((valid-closing_price),2)))
print(rms)

#for plotting
# Split at the same boundary the model was trained with so the prediction
# column has exactly one value per validation row.
train = new_data.iloc[:n_train_days]
valid = new_data.iloc[n_train_days:].copy()
valid['Predictions'] = closing_price.ravel()
plt.plot(train['Close'])
plt.plot(valid[['Close','Predictions']])


In [None]:
# ARIMA
from statsmodels.tsa.arima.model import ARIMA
from pandas.tseries.offsets import DateOffset
import statsmodels.api as sm

# Fit an ARIMA(p=1, d=1, q=2) model to the 'close' column of the differenced frame.
# NOTE(review): `diff` comes out of the differencing step, and d=1 differences
# the series once more inside the model — confirm this is intended.
model = ARIMA(diff['close'], order=(1,1,2))
model_fit=model.fit()
# Not the last expression of the cell, so the summary is not rendered.
model_fit.summary()

# In-sample predictions plus 60 steps past the end of the series.
pred = model_fit.predict(start=0, end=len(diff.close)+60)
# Only the tail from position 1760 onwards is drawn.
diff.close[1760:].plot(color='blue', label='Actual')
pred[1760:].plot(color='red', label='Predicted')


In [None]:
# Future prediction
# Twelve consecutive days starting at the last date of the original frame.
future_dates = [df.index[-1] + DateOffset(days=x) for x in range(0, 12)]

# Empty placeholder rows for the 11 days after the last observation (the first
# entry of future_dates is the last observed day and is skipped).
future_datest_df = pd.DataFrame(index=future_dates[1:], columns=df.columns)
future_datest_df.tail()

# Append the placeholder rows to the differenced frame and attach the model's
# predictions for positions 1700..1900.
# NOTE(review): the assignment aligns on index labels; confirm that the index
# returned by predict() matches the labels of `future_df`, otherwise the
# 'forecast' column is filled with NaN.
future_df = pd.concat([diff, future_datest_df])
future_df['forecast'] = model_fit.predict(start=1700, end=1900, dynamic=False)
future_df[['close', 'forecast']][1700:].plot(figsize=(12, 8))