In [1]:
from sklearn.model_selection import train_test_split  # for the initial split to a train set and a untouched test set
from sklearn.model_selection import TimeSeriesSplit  # for roll forward cross vallidation
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import linear_model, preprocessing
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingClassifier
from sklearn import metrics
from sklearn import tree
from sklearn.svm import SVR

import pandas as pd
import numpy as np
import statsmodels.api as sm

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('data/btc_df_corrVariables.csv', index_col='Date')

In [3]:
df = df.drop(columns=['Unnamed: 0'])

In [4]:
df.head()

Unnamed: 0_level_0,close,volume,ema_short,ema_long,atr,obv,tweet_sentiment,close_nextday
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-09-02,10340.0,44740.25,10164.518939,10452.265343,530.693553,225053.863244,-1.0,10615.28
2019-09-03,10615.28,47998.38,10207.448563,10458.658074,528.572585,273052.240025,0.5,10567.02
2019-09-04,10567.02,43943.89,10241.693462,10462.907561,521.468114,229108.350999,0.5,10564.49
2019-09-05,10564.49,33970.96,10272.43599,10466.891187,516.363249,195137.39036,0.5,10298.73
2019-09-06,10298.73,58799.64,10274.940181,10460.29663,533.470874,136337.749401,0.0,10455.88


In [5]:
x1 = df[['close']]
y1 = df[['close_nextday']]
x1 = sm.add_constant(x1)

x1_train, x1_test, y1_train, y1_test = train_test_split(x1, y1, test_size=0.25, random_state=123)

lr1 = linear_model.LinearRegression(normalize=True)
lr1 = lr1.fit(x1_train, y1_train)

lr1_prediction = lr1.predict(x1)
lr1_confidence = lr1.score(x1_test, y1_test)

print(lr1_prediction[:5])
print("\nlr1 confidence: ", lr1_confidence)

[[10363.12120161]
 [10641.60420776]
 [10592.78268104]
 [10590.22324335]
 [10321.37100667]]

lr1 confidence:  0.9652390352545006


In [6]:
x2 = df[['volume']]
y2 = df[['close_nextday']]
x2 = sm.add_constant(x2)

x2_train, x2_test, y2_train, y2_test = train_test_split(x2, y2, test_size=0.25, random_state=123)

lr2 = linear_model.LinearRegression(normalize=True)
lr2 = lr2.fit(x2_train, y2_train)

lr2_prediction = lr2.predict(x2)
lr2_confidence = lr2.score(x2_test, y2_test)

print(lr2_prediction[:5])
print("lr2 confidence: ", lr2_confidence)

[[9533.64399058]
 [9507.7895581 ]
 [9539.96339338]
 [9619.10217707]
 [9422.07767816]]
lr2 confidence:  0.019148109127178015


In [7]:
x3 = df[['ema_short']]
y3 = df[['close_nextday']]
x3 = sm.add_constant(x3)

x3_train, x3_test, y3_train, y3_test = train_test_split(x3, y3, test_size=0.25, random_state=123)

lr3 = linear_model.LinearRegression(normalize=True)
lr3 = lr3.fit(x3_train, y3_train)

lr3_prediction = lr3.predict(x3)
lr3_confidence = lr3.score(x3_test, y3_test)

print(lr3_prediction[:5])
print("lr3 confidence: ", lr3_confidence)

[[10434.34225379]
 [10484.05427872]
 [10523.70948799]
 [10559.30899049]
 [10562.20881617]]
lr3 confidence:  0.8042583851769596


In [8]:
x4 = df[['ema_long']]
y4 = df[['close_nextday']]
x4 = sm.add_constant(x4)

x4_train, x4_test, y4_train, y4_test = train_test_split(x4, y4, test_size=0.25, random_state=123)

lr4 = linear_model.LinearRegression(normalize=True)
lr4 = lr4.fit(x4_train, y4_train)

lr4_prediction = lr4.predict(x4)
lr4_confidence = lr4.score(x4_test, y4_test)

print(lr4_prediction[:5])
print("lr4 confidence: ", lr4_confidence)

[[11052.34059298]
 [11060.73211023]
 [11066.31026488]
 [11071.53943138]
 [11062.88298588]]
lr4 confidence:  0.6178904472037611


In [9]:
x5 = df[['atr']]
y5 = df[['close_nextday']]
x5 = sm.add_constant(x5)

x5_train, x5_test, y5_train, y5_test = train_test_split(x5, y5, test_size=0.25, random_state=123)

lr5 = linear_model.LinearRegression(normalize=True)
lr5 = lr5.fit(x5_train, y5_train)

lr5_prediction = lr5.predict(x5)
lr5_confidence = lr5.score(x5_test, y5_test)

print(lr5_prediction[:5])
print("lr5 confidence: ", lr5_confidence)

[[9505.28883388]
 [9503.15910571]
 [9496.02529222]
 [9490.8993426 ]
 [9508.07762621]]
lr5 confidence:  -0.010333207187534166


In [10]:
x6 = df[['obv']]
y6 = df[['close_nextday']]
x6 = sm.add_constant(x6)
x6_train, x6_test, y6_train, y6_test = train_test_split(x6, y6, test_size=0.25, random_state=123)

lr6 = linear_model.LinearRegression(normalize=True)
lr6 = lr6.fit(x6_train, y6_train)

lr6_prediction = lr6.predict(x6)
lr6_confidence = lr6.score(x6_test, y6_test)

print(lr6_prediction[:5])
print("lr6 confidence: ", lr6_confidence)

[[8873.27405386]
 [8976.60497842]
 [8882.0025567 ]
 [8808.86985917]
 [8682.28596881]]
lr6 confidence:  0.5881596894990473


In [11]:
x7 = df[['tweet_sentiment']]
y7 = df[['close_nextday']]
x7 = sm.add_constant(x7)

x7_train, x7_test, y7_train, y7_test = train_test_split(x7, y7, test_size=0.25, random_state=123)

lr7 = linear_model.LinearRegression(normalize=True)
lr7 = lr7.fit(x7_train, y7_train)

lr7_prediction = lr7.predict(x7)
lr7_confidence = lr7.score(x7_test, y7_test)

print(lr7_prediction[:5])
print("lr7 confidence: ", lr7_confidence)

[[9317.14355785]
 [9418.52487761]
 [9418.52487761]
 [9418.52487761]
 [9384.73110435]]
lr7 confidence:  -0.00500831592492279


In [12]:
X = df[['close']]
y = df[['close_nextday']]
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=42, train_size=0.85)

In [13]:
vot_clf = VotingClassifier(estimators=[('lr1', lr1),
                                       ('lr3', lr3, 'lr4', lr4, 'lr6', lr6,
                                        'lr7', lr7)],
                           voting='hard')

In [15]:
from sklearn import preprocessing
from sklearn import utils
lab_enc = preprocessing.LabelEncoder()
y_train = lab_enc.fit_transform(y_train)
y_train = y_train.reshape(-1,1)
print(y_train)

[[278]
 [107]
 [157]
 [300]
 [249]
 [101]
 [  5]
 [ 59]
 [319]
 [254]
 [100]
 [217]
 [117]
 [ 89]
 [359]
 [225]
 [367]
 [168]
 [173]
 [358]
 [103]
 [263]
 [140]
 [330]
 [142]
 [ 99]
 [ 41]
 [187]
 [202]
 [ 77]
 [ 25]
 [ 29]
 [271]
 [ 66]
 [204]
 [335]
 [282]
 [ 88]
 [ 30]
 [  7]
 [306]
 [234]
 [269]
 [324]
 [262]
 [268]
 [272]
 [255]
 [137]
 [195]
 [ 37]
 [175]
 [200]
 [ 73]
 [371]
 [ 13]
 [115]
 [231]
 [183]
 [229]
 [ 94]
 [257]
 [293]
 [ 42]
 [284]
 [326]
 [ 51]
 [ 48]
 [364]
 [356]
 [305]
 [313]
 [198]
 [150]
 [301]
 [130]
 [221]
 [ 15]
 [  1]
 [233]
 [ 39]
 [354]
 [126]
 [285]
 [298]
 [143]
 [ 35]
 [365]
 [303]
 [119]
 [345]
 [292]
 [ 44]
 [336]
 [176]
 [  2]
 [341]
 [246]
 [316]
 [122]
 [342]
 [189]
 [124]
 [128]
 [210]
 [310]
 [294]
 [151]
 [186]
 [ 61]
 [  9]
 [ 17]
 [274]
 [172]
 [163]
 [251]
 [205]
 [148]
 [162]
 [279]
 [317]
 [ 36]
 [ 60]
 [357]
 [353]
 [241]
 [355]
 [194]
 [369]
 [ 20]
 [ 12]
 [  3]
 [ 67]
 [ 74]
 [267]
 [346]
 [206]
 [289]
 [ 43]
 [136]
 [160]
 [ 65]
 [224]

In [17]:
vot_clf.fit(x_train, y_train)

ValueError: The estimator LinearRegression should be a classifier.

In [None]:
pred = vot_clf.predict(x_test)
accuracy_score(y_test, pred)

In [None]:
lr_prediction = lr.predict(df1)
print(lr_prediction)

In [None]:
x2 = df.drop(['close_nextday','close'],axis=1)
y2 = df[['close_nextday']]

# Create constants for X, so the model knows its bounds
x2 = sm.add_constant(x2)

# Split the data
x2_train, x2_test, y2_train, y2_test = train_test_split(x2, y2, test_size=0.25, random_state=123)

In [None]:
rModel2 = sm.OLS(y2_train, x2_train)

# Fit the model
results2 = rModel2.fit()

In [None]:
results2.summary()

In [None]:
y2_pred = results2.predict(x2_test)

In [None]:
# Plot the predictions
# Build a scatterplot
_ = plt.scatter(y2_test,y2_pred)

# Add a line for perfect correlation
_ = plt.plot([x for x in range(5000,16000)], [y for y in range(5000,16000)], color='orange')

# Label it nicely
_ = plt.title('Model 3 Prediction vs Actual')
_ = plt.xlabel('actual values')
_ = plt.ylabel('predicted values')

In [None]:
def rmse(predictions, targets):
    return np.sqrt(((predictions-targets)**2).mean())

In [None]:
# Get predictions from rModel3
y2b_pred = results2.predict(x2_test)

# Put the predictions & actual values into a dataframe
df03 = pd.DataFrame(y2_test)
df03.rename(columns={'close_nextday':'actual'}, inplace=True)
df03['predicted'] = y2b_pred

rmse(df03.actual, df03.predicted)