In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import json

In [2]:
# function courtesy of CryptoDataDownload.com
def fetch_daily_data(symbol):
    pair_split = symbol.split('/')  # symbol must be in format XXX/XXX ie. BTC/EUR
    symbol = pair_split[0] + '-' + pair_split[1]
    url = f'https://api.pro.coinbase.com/products/{symbol}/candles?granularity=86400'
    response = requests.get(url)
    if response.status_code == 200:  # check to make sure the response from server is good
        data = pd.DataFrame(json.loads(response.text), columns=['unix', 'low', 'high', 'open', 'close', 'volume'])
        data['date'] = pd.to_datetime(data['unix'], unit='s')  # convert to a readable date
        data['vol_fiat'] = data['volume'] * data['close']      # multiply the BTC volume by closing price to approximate fiat volume
                            
        # if we failed to get any data, print an error...otherwise write the file
        if data is None:
            print("Did not return any data from Coinbase for this symbol")
        else:
            data.to_csv(f'cb_{pair_split[0] + pair_split[1]}_daily.csv', index=False)
    else:
        print("Did not receieve OK response from Coinbase API")

## Pulling Tezos
Trying Tezos next.

In [3]:
# fetch_daily_data('XTZ/USD')

In [4]:
df = pd.read_csv('cb_XTZUSD_daily.csv', index_col='date', parse_dates=[0])

In [5]:
df.head()

Unnamed: 0_level_0,unix,low,high,open,close,volume,vol_fiat
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-01-20,1611100800,2.6444,2.9496,2.8723,2.8772,4913515.79,14137170.0
2021-01-19,1611014400,2.8724,3.22,3.0346,2.8731,6922518.64,19889090.0
2021-01-18,1610928000,2.845,3.1175,2.9726,3.0342,5120896.85,15537830.0
2021-01-17,1610841600,2.6851,3.1136,2.853,2.9738,6827360.01,20303200.0
2021-01-16,1610755200,2.6904,3.1882,2.7064,2.854,13772368.03,39306340.0


In [6]:
df.drop('unix', axis=1, inplace=True)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 300 entries, 2021-01-20 to 2020-03-27
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   low       300 non-null    float64
 1   high      300 non-null    float64
 2   open      300 non-null    float64
 3   close     300 non-null    float64
 4   volume    300 non-null    float64
 5   vol_fiat  300 non-null    float64
dtypes: float64(6)
memory usage: 16.4+ KB


In [8]:
df.index = pd.to_datetime(df.index)

In [9]:
df.sort_values(by='date', axis=0, inplace=True)

In [10]:
df.head()

Unnamed: 0_level_0,low,high,open,close,volume,vol_fiat
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-03-27,1.5713,1.7897,1.7645,1.5897,3969087.96,6309659.0
2020-03-28,1.5052,1.6226,1.5896,1.5655,3872624.59,6062594.0
2020-03-29,1.4664,1.5804,1.5693,1.4698,2250825.26,3308263.0
2020-03-30,1.461,1.665,1.4675,1.6077,2626418.2,4222493.0
2020-03-31,1.5532,1.6424,1.6081,1.611,1892308.96,3048510.0


# Feature Engineering
Here we are designing our features to better predict the price.

In [11]:
def make_features(data, max_lag):
    data['month'] = data.index.month
    data['dayofweek'] = data.index.dayofweek
    data['week_low_mean'] = data['low'].shift().rolling(7).mean()
    data['week_high_mean'] = data['high'].shift().rolling(7).mean()
    data['bi_week_low_mean'] = data['low'].shift().rolling(14).mean()
    data['bi_week_high_mean'] = data['high'].shift().rolling(14).mean()
    data['month_low_mean'] = data['low'].shift().rolling(28).mean()
    data['month_high_mean'] = data['high'].shift().rolling(28).mean()
    for lag in range(1, max_lag + 1):
        data['high_lag_{}'.format(lag)] = data['high'].shift(lag)
    for lag in range(1, max_lag + 1):
        data['low_lag_{}'.format(lag)] = data['low'].shift(lag)

In [12]:
make_features(df, 11)

In [13]:
df.tail()

Unnamed: 0_level_0,low,high,open,close,volume,vol_fiat,month,dayofweek,week_low_mean,week_high_mean,...,low_lag_2,low_lag_3,low_lag_4,low_lag_5,low_lag_6,low_lag_7,low_lag_8,low_lag_9,low_lag_10,low_lag_11
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-16,2.6904,3.1882,2.7064,2.854,13772368.03,39306340.0,1,5,2.306043,2.7003,...,2.4061,2.2276,2.1947,2.0583,2.3918,2.418,2.25,2.405,2.2972,2.1361
2021-01-17,2.6851,3.1136,2.853,2.9738,6827360.01,20303200.0,1,6,2.344957,2.770271,...,2.4458,2.4061,2.2276,2.1947,2.0583,2.3918,2.418,2.25,2.405,2.2972
2021-01-18,2.845,3.1175,2.9726,3.0342,5120896.85,15537830.0,1,0,2.386857,2.793486,...,2.6904,2.4458,2.4061,2.2276,2.1947,2.0583,2.3918,2.418,2.25,2.405
2021-01-19,2.8724,3.22,3.0346,2.8731,6922518.64,19889090.0,1,1,2.499243,2.843014,...,2.6851,2.6904,2.4458,2.4061,2.2276,2.1947,2.0583,2.3918,2.418,2.25
2021-01-20,2.6444,2.9496,2.8723,2.8772,4913515.79,14137170.0,1,2,2.596057,2.952714,...,2.845,2.6851,2.6904,2.4458,2.4061,2.2276,2.1947,2.0583,2.3918,2.418


In [14]:
df.columns

Index(['low', 'high', 'open', 'close', 'volume', 'vol_fiat', 'month',
       'dayofweek', 'week_low_mean', 'week_high_mean', 'bi_week_low_mean',
       'bi_week_high_mean', 'month_low_mean', 'month_high_mean', 'high_lag_1',
       'high_lag_2', 'high_lag_3', 'high_lag_4', 'high_lag_5', 'high_lag_6',
       'high_lag_7', 'high_lag_8', 'high_lag_9', 'high_lag_10', 'high_lag_11',
       'low_lag_1', 'low_lag_2', 'low_lag_3', 'low_lag_4', 'low_lag_5',
       'low_lag_6', 'low_lag_7', 'low_lag_8', 'low_lag_9', 'low_lag_10',
       'low_lag_11'],
      dtype='object')

In [15]:
# Since we are trying to predict, many columns in here are not necessary and could impede our model
df.drop(['vol_fiat', 'volume', 'close', 'open'], axis=1, inplace=True)

In [16]:
df.columns

Index(['low', 'high', 'month', 'dayofweek', 'week_low_mean', 'week_high_mean',
       'bi_week_low_mean', 'bi_week_high_mean', 'month_low_mean',
       'month_high_mean', 'high_lag_1', 'high_lag_2', 'high_lag_3',
       'high_lag_4', 'high_lag_5', 'high_lag_6', 'high_lag_7', 'high_lag_8',
       'high_lag_9', 'high_lag_10', 'high_lag_11', 'low_lag_1', 'low_lag_2',
       'low_lag_3', 'low_lag_4', 'low_lag_5', 'low_lag_6', 'low_lag_7',
       'low_lag_8', 'low_lag_9', 'low_lag_10', 'low_lag_11'],
      dtype='object')

# Preparing Data for Model

In [17]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [18]:
train, test = train_test_split(df, shuffle=False, test_size=0.1)
train = train.dropna()

In [19]:
X_train = train.drop(['low', 'high'], axis=1)
X_test = test.drop(['low', 'high'], axis=1)
y_train = train[['low', 'high']]
y_test = test[['low', 'high']]

In [20]:
model = RandomForestRegressor(random_state=47)

In [21]:
model.fit(X_train, y_train)
predict = model.predict(X_test)

In [22]:
rmse = np.sqrt(mean_squared_error(y_test, predict))
rmse

0.15095086944637556

In [23]:
predictions = pd.DataFrame(predict, columns=['low', 'high'], index=y_test.index)
compare = pd.concat([predictions, y_test], axis=1)
compare.columns = ['pred_low', 'pred_high', 'real_low', 'real_high']
compare

Unnamed: 0_level_0,pred_low,pred_high,real_low,real_high
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-12-22,2.126595,2.271106,2.0778,2.1952
2020-12-23,2.086719,2.190118,1.6,2.2013
2020-12-24,1.977674,2.151569,1.7943,2.0844
2020-12-25,1.899219,2.069569,1.9775,2.1148
2020-12-26,2.002624,2.15341,1.9252,2.048
2020-12-27,1.858683,2.003528,1.879,2.1446
2020-12-28,1.984699,2.168943,1.9831,2.2471
2020-12-29,2.071877,2.219403,1.9777,2.1729
2020-12-30,2.041256,2.185282,1.9591,2.0751
2020-12-31,1.877029,2.028387,1.9397,2.0238


In [24]:
len(compare[compare['real_high'] < compare['pred_high']])

11

In [25]:
len(compare[compare['real_low'] < compare['pred_low']])

16

To better see where model error is occuring, I wanted to look line-by-line through the results. Here we see in many cases the actual low is considerably lower than what is predicted. But, the high is often closer to the predictions. Further, we see that in only about 1/3 of our examples is the actual high lower than the predicted high. 

With this in mind, we should generally assume lower lows and higher highs than our model predicts. Conservatively, we might instead just assume the lower lows (buying multiple times to anticipate the drop) and keep our sale price at the predicted highs. 

# Hyperparamter Tuning
## Sticking with Random Forest

While there are various ways we can approach hyperparameter tuning (and many would argue what I'm about to do is the least efficient way), I want to manually work through this for both curiousity and for a desire to be thorough. Further, I recognize that I should be cross-validating, which I might end up doing. For now however, I'll proceed in this more tedious fashion. 

In [26]:
for estims in range(100, 201, 10):
    model = RandomForestRegressor(random_state=47, n_estimators=estims)
    model.fit(X_train, y_train)
    predict = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, predict))
    print("n_estimators =", estims, ":", rmse)

n_estimators = 100 : 0.15095086944637556
n_estimators = 110 : 0.1508999204324349
n_estimators = 120 : 0.15184071619925532
n_estimators = 130 : 0.15236691401631144
n_estimators = 140 : 0.15249159333489368
n_estimators = 150 : 0.15258562228818953
n_estimators = 160 : 0.15215360135599448
n_estimators = 170 : 0.15218850745531434
n_estimators = 180 : 0.15199893233010112
n_estimators = 190 : 0.15190257555553882
n_estimators = 200 : 0.1523682073214555


In [27]:
for depth in range(100, 201, 10):
    model = RandomForestRegressor(random_state=47, n_estimators=110, max_depth=depth)
    model.fit(X_train, y_train)
    predict = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, predict))
    print("max_depth=", depth, ":", rmse)

max_depth= 100 : 0.1508999204324349
max_depth= 110 : 0.1508999204324349
max_depth= 120 : 0.1508999204324349
max_depth= 130 : 0.1508999204324349
max_depth= 140 : 0.1508999204324349
max_depth= 150 : 0.1508999204324349
max_depth= 160 : 0.1508999204324349
max_depth= 170 : 0.1508999204324349
max_depth= 180 : 0.1508999204324349
max_depth= 190 : 0.1508999204324349
max_depth= 200 : 0.1508999204324349


In [28]:
for features in ['auto', 'sqrt', 'log2']:
    model = RandomForestRegressor(random_state=47, n_estimators=110, max_depth=100, max_features=features)
    model.fit(X_train, y_train)
    predict = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, predict))
    print("feature=", features, ":", rmse)

feature= auto : 0.1508999204324349
feature= sqrt : 0.16971653409565254
feature= log2 : 0.1735729536444533


In [29]:
for splits in range(2, 10, 1):
    model = RandomForestRegressor(random_state=47, n_estimators=110, max_depth=100, max_features='auto', 
                                  min_samples_split=splits)
    model.fit(X_train, y_train)
    predict = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, predict))
    print("splits =", splits, ":", rmse)

splits = 2 : 0.1508999204324349
splits = 3 : 0.15348220944794952
splits = 4 : 0.15228515181178717
splits = 5 : 0.15260532360789236
splits = 6 : 0.15381121466730432
splits = 7 : 0.15300438613532316
splits = 8 : 0.15322962590737924
splits = 9 : 0.15487240451691872


In [30]:
for leaves in range(1, 10, 1):
    model = RandomForestRegressor(random_state=47, n_estimators=110, max_depth=100, max_features='auto', 
                                  min_samples_split=2, min_samples_leaf=leaves)
    model.fit(X_train, y_train)
    predict = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, predict))
    print("leaves =", leaves, ":", rmse)

leaves = 1 : 0.1508999204324349
leaves = 2 : 0.15552144375209387
leaves = 3 : 0.15806536610831295
leaves = 4 : 0.15737496032515066
leaves = 5 : 0.15897689964960576
leaves = 6 : 0.1578901435437812
leaves = 7 : 0.15777442574516407
leaves = 8 : 0.15896818620600162
leaves = 9 : 0.15925955623449245


Our best model at this point is a RandomForestRegressor with 110 Estimators, 100 max depth, auto features, min_samples_split of 2 and min_samples_leaf of 1. This gives us ~ 0.1509 RMSE. When running a grid search, I find that I cannot beat this, so I will instead check out other models. 

In [31]:
'''from sklearn.model_selection import GridSearchCV
np.random.seed(47)

model = RandomForestRegressor(n_jobs=-1)

gs_model = GridSearchCV(estimator=model, param_grid=grid, cv=3, verbose=2)

gs_model.fit(X_train, y_train);'''

'from sklearn.model_selection import GridSearchCV\nnp.random.seed(47)\n\nmodel = RandomForestRegressor(n_jobs=-1)\n\ngs_model = GridSearchCV(estimator=model, param_grid=grid, cv=3, verbose=2)\n\ngs_model.fit(X_train, y_train);'

In [32]:
"""gs_model.best_params_
model = RandomForestRegressor(max_depth=None, max_features='auto', min_samples_leaf=5, min_samples_split=3, random_state=47)
model.fit(X_train, y_train)
predict = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, predict))
rmse
This model achieved a 0.1592 rmse"""

"gs_model.best_params_\nmodel = RandomForestRegressor(max_depth=None, max_features='auto', min_samples_leaf=5, min_samples_split=3, random_state=47)\nmodel.fit(X_train, y_train)\npredict = model.predict(X_test)\nrmse = np.sqrt(mean_squared_error(y_test, predict))\nrmse\nThis model achieved a 0.1592 rmse"

In [33]:
model = RandomForestRegressor(random_state=47, n_estimators=110, max_depth=100, max_features='auto', 
                                  min_samples_split=2)

In [34]:
model.fit(X_train, y_train)
predict = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, predict))
rmse

0.1508999204324349

In [35]:
predictions = pd.DataFrame(predict, columns=['low', 'high'], index=y_test.index)
compare = pd.concat([predictions, y_test], axis=1)
compare.columns = ['pred_low', 'pred_high', 'real_low', 'real_high']
compare

Unnamed: 0_level_0,pred_low,pred_high,real_low,real_high
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-12-22,2.122663,2.26676,2.0778,2.1952
2020-12-23,2.085475,2.188253,1.6,2.2013
2020-12-24,1.974713,2.149753,1.7943,2.0844
2020-12-25,1.899488,2.067215,1.9775,2.1148
2020-12-26,2.004187,2.15122,1.9252,2.048
2020-12-27,1.858198,2.001672,1.879,2.1446
2020-12-28,1.984289,2.167526,1.9831,2.2471
2020-12-29,2.072323,2.220609,1.9777,2.1729
2020-12-30,2.038568,2.183256,1.9591,2.0751
2020-12-31,1.875364,2.026902,1.9397,2.0238


# Trying Other Models

## MultiTask Lasso

In [36]:
from sklearn.linear_model import MultiTaskLasso
model = MultiTaskLasso(normalize=True, random_state=47, warm_start=True)

In [37]:
model.fit(X_train, y_train)
predict = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, predict))
rmse

0.44322675047310706

## ElasticNet

In [38]:
from sklearn.linear_model import ElasticNet
model = ElasticNet()

In [39]:
model.fit(X_train, y_train)
predict = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, predict))
rmse

0.4399830751279508

## Ridge

In [40]:
from sklearn.linear_model import Ridge
model = Ridge()

In [41]:
model.fit(X_train, y_train)
predict = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, predict))
rmse

0.1708339723438454

In [52]:
predictions = pd.DataFrame(predict, columns=['low', 'high'], index=y_test.index)
compare = pd.concat([predictions, y_test], axis=1)
compare.columns = ['pred_low', 'pred_high', 'real_low', 'real_high']
compare

Unnamed: 0_level_0,pred_low,pred_high,real_low,real_high
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-12-22,2.095589,2.302515,2.0778,2.1952
2020-12-23,2.042425,2.207764,1.6,2.2013
2020-12-24,1.780355,2.019492,1.7943,2.0844
2020-12-25,1.877922,2.087707,1.9775,2.1148
2020-12-26,1.9679,2.148678,1.9252,2.048
2020-12-27,1.941926,2.035605,1.879,2.1446
2020-12-28,1.962271,2.107331,1.9831,2.2471
2020-12-29,2.025607,2.229213,1.9777,2.1729
2020-12-30,2.014394,2.186202,1.9591,2.0751
2020-12-31,2.02177,2.131301,1.9397,2.0238


Since Ridge may prove worthy of further exploration, I will attempt some hyperparameter tuning there. If this cannot beat our random forest, we will next look at improvements to our feature engineering that might enhance results. 

In [59]:
for alphas in np.arange(0.01, 0.1, .01):
    model = Ridge(random_state=47, alpha=alphas)
    model.fit(X_train, y_train)
    predict = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, predict))
    print("alphas =", alphas, ":", rmse)

alphas = 0.01 : 0.16541239135093028
alphas = 0.02 : 0.16537118088988698
alphas = 0.03 : 0.16537319363372022
alphas = 0.04 : 0.1653939547632697
alphas = 0.05 : 0.16542419806960618
alphas = 0.060000000000000005 : 0.1654599007220746
alphas = 0.06999999999999999 : 0.1654991475537135
alphas = 0.08 : 0.16554095914171071
alphas = 0.09 : 0.16558480055268116


In [61]:
model = Ridge(random_state=47, alpha=0.02, fit_intercept=False)
model.fit(X_train, y_train)
predict = model.predict(X_test)   
rmse = np.sqrt(mean_squared_error(y_test, predict))
print(rmse)

0.15571850298293935


In [67]:
model = Ridge(random_state=47, alpha=0.02, normalize=True)
model.fit(X_train, y_train)
predict = model.predict(X_test)   
rmse = np.sqrt(mean_squared_error(y_test, predict))
print(rmse)

0.16957445606785154


In [68]:
for alphas in np.arange(0.2, .3, .01):
    model = Ridge(random_state=47, alpha=alphas, fit_intercept=False)
    model.fit(X_train, y_train)
    predict = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, predict))
    print("alphas =", alphas, ":", rmse)

alphas = 0.2 : 0.15500617105809353
alphas = 0.21000000000000002 : 0.15500824710437244
alphas = 0.22000000000000003 : 0.15501264923266267
alphas = 0.23000000000000004 : 0.1550192409821717
alphas = 0.24000000000000005 : 0.15502789547500959
alphas = 0.25000000000000006 : 0.15503849447556511
alphas = 0.26000000000000006 : 0.15505092759526948
alphas = 0.2700000000000001 : 0.15506509160890977
alphas = 0.2800000000000001 : 0.155080889859931
alphas = 0.2900000000000001 : 0.15509823173692394


In [75]:
for solver in ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']:
    model = Ridge(random_state=47, alpha=0.2, fit_intercept=False, solver=solver)
    model.fit(X_train, y_train)
    predict = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, predict))
    print("solver =", solver, ":", rmse)

solver = auto : 0.15500617105809353
solver = svd : 0.15500617105818848
solver = cholesky : 0.15500617105809353
solver = lsqr : 0.1612973699446333
solver = sparse_cg : 0.21845047639620313
solver = sag : 0.1584596385214715
solver = saga : 0.161870239826584


While this is far from exhaustive, it seems our best option is still to stick with RandomForestRegressor for our analysis. With that in mind, my next step will be further feature engineering to see if I can improve the accuracy further. 

# Searching for Best Features
## Auto Feature Engineering

In [44]:
'''df = pd.read_csv('cb_XTZUSD_daily.csv', index_col='date', parse_dates=[0])
df.drop('unix', axis=1, inplace=True)
df.index = pd.to_datetime(df.index)
df.sort_values(by='date', axis=0, inplace=True)
df.head()'''

"df = pd.read_csv('cb_XTZUSD_daily.csv', index_col='date', parse_dates=[0])\ndf.drop('unix', axis=1, inplace=True)\ndf.index = pd.to_datetime(df.index)\ndf.sort_values(by='date', axis=0, inplace=True)\ndf.head()"

In [45]:
def find_best_features(data, test_range):
    '''Takes a standard pandas dataframe and the amount of units to test. In this way, it sorts through our 
    make_features() function to find the best possible results without hyperparamter tuning.'''