In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import json

In [2]:
# function courtesy of CryptoDataDownload.com
def fetch_daily_data(symbol):
    pair_split = symbol.split('/')  # symbol must be in format XXX/XXX ie. BTC/EUR
    symbol = pair_split[0] + '-' + pair_split[1]
    url = f'https://api.pro.coinbase.com/products/{symbol}/candles?granularity=86400'
    response = requests.get(url)
    if response.status_code == 200:  # check to make sure the response from server is good
        data = pd.DataFrame(json.loads(response.text), columns=['unix', 'low', 'high', 'open', 'close', 'volume'])
        data['date'] = pd.to_datetime(data['unix'], unit='s')  # convert to a readable date
        data['vol_fiat'] = data['volume'] * data['close']      # multiply the BTC volume by closing price to approximate fiat volume
                            
        # if we failed to get any data, print an error...otherwise write the file
        if data is None:
            print("Did not return any data from Coinbase for this symbol")
        else:
            data.to_csv(f'cb_{pair_split[0] + pair_split[1]}_daily.csv', index=False)
    else:
        print("Did not receieve OK response from Coinbase API")

## Pulling Tezos
Trying Tezos next.

In [3]:
fetch_daily_data('XTZ/USD')

In [4]:
df = pd.read_csv('cb_XTZUSD_daily.csv', index_col='date', parse_dates=[0])

In [5]:
df.head()

Unnamed: 0_level_0,unix,low,high,open,close,volume,vol_fiat
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-01-20,1611100800,2.6444,2.9496,2.8723,2.8772,4913515.79,14137170.0
2021-01-19,1611014400,2.8724,3.22,3.0346,2.8731,6922518.64,19889090.0
2021-01-18,1610928000,2.845,3.1175,2.9726,3.0342,5120896.85,15537830.0
2021-01-17,1610841600,2.6851,3.1136,2.853,2.9738,6827360.01,20303200.0
2021-01-16,1610755200,2.6904,3.1882,2.7064,2.854,13772368.03,39306340.0


In [6]:
df.drop('unix', axis=1, inplace=True)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 300 entries, 2021-01-20 to 2020-03-27
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   low       300 non-null    float64
 1   high      300 non-null    float64
 2   open      300 non-null    float64
 3   close     300 non-null    float64
 4   volume    300 non-null    float64
 5   vol_fiat  300 non-null    float64
dtypes: float64(6)
memory usage: 16.4+ KB


In [8]:
df.index = pd.to_datetime(df.index)

In [9]:
df.sort_values(by='date', axis=0, inplace=True)

In [10]:
df.head()

Unnamed: 0_level_0,low,high,open,close,volume,vol_fiat
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-03-27,1.5713,1.7897,1.7645,1.5897,3969087.96,6309659.0
2020-03-28,1.5052,1.6226,1.5896,1.5655,3872624.59,6062594.0
2020-03-29,1.4664,1.5804,1.5693,1.4698,2250825.26,3308263.0
2020-03-30,1.461,1.665,1.4675,1.6077,2626418.2,4222493.0
2020-03-31,1.5532,1.6424,1.6081,1.611,1892308.96,3048510.0


# Feature Engineering
Here we are designing our features to better predict the price.

In [11]:
def make_features(data, max_lag):
    data['month'] = data.index.month
    data['dayofweek'] = data.index.dayofweek
    data['week_low_mean'] = data['low'].shift().rolling(7).mean()
    data['week_high_mean'] = data['high'].shift().rolling(7).mean()
    data['bi_week_low_mean'] = data['low'].shift().rolling(14).mean()
    data['bi_week_high_mean'] = data['high'].shift().rolling(14).mean()
    data['month_low_mean'] = data['low'].shift().rolling(28).mean()
    data['month_high_mean'] = data['high'].shift().rolling(28).mean()
    for lag in range(1, max_lag + 1):
        data['high_lag_{}'.format(lag)] = data['high'].shift(lag)
    for lag in range(1, max_lag + 1):
        data['low_lag_{}'.format(lag)] = data['low'].shift(lag)

In [12]:
make_features(df, 14)

In [13]:
df.tail()

Unnamed: 0_level_0,low,high,open,close,volume,vol_fiat,month,dayofweek,week_low_mean,week_high_mean,...,low_lag_5,low_lag_6,low_lag_7,low_lag_8,low_lag_9,low_lag_10,low_lag_11,low_lag_12,low_lag_13,low_lag_14
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-16,2.6904,3.1882,2.7064,2.854,13772368.03,39306340.0,1,5,2.306043,2.7003,...,2.0583,2.3918,2.418,2.25,2.405,2.2972,2.1361,2.0158,1.9538,1.95
2021-01-17,2.6851,3.1136,2.853,2.9738,6827360.01,20303200.0,1,6,2.344957,2.770271,...,2.1947,2.0583,2.3918,2.418,2.25,2.405,2.2972,2.1361,2.0158,1.9538
2021-01-18,2.845,3.1175,2.9726,3.0342,5120896.85,15537830.0,1,0,2.386857,2.793486,...,2.2276,2.1947,2.0583,2.3918,2.418,2.25,2.405,2.2972,2.1361,2.0158
2021-01-19,2.8724,3.22,3.0346,2.8731,6922518.64,19889090.0,1,1,2.499243,2.843014,...,2.4061,2.2276,2.1947,2.0583,2.3918,2.418,2.25,2.405,2.2972,2.1361
2021-01-20,2.6444,2.9496,2.8723,2.8772,4913515.79,14137170.0,1,2,2.596057,2.952714,...,2.4458,2.4061,2.2276,2.1947,2.0583,2.3918,2.418,2.25,2.405,2.2972


In [14]:
df.columns

Index(['low', 'high', 'open', 'close', 'volume', 'vol_fiat', 'month',
       'dayofweek', 'week_low_mean', 'week_high_mean', 'bi_week_low_mean',
       'bi_week_high_mean', 'month_low_mean', 'month_high_mean', 'high_lag_1',
       'high_lag_2', 'high_lag_3', 'high_lag_4', 'high_lag_5', 'high_lag_6',
       'high_lag_7', 'high_lag_8', 'high_lag_9', 'high_lag_10', 'high_lag_11',
       'high_lag_12', 'high_lag_13', 'high_lag_14', 'low_lag_1', 'low_lag_2',
       'low_lag_3', 'low_lag_4', 'low_lag_5', 'low_lag_6', 'low_lag_7',
       'low_lag_8', 'low_lag_9', 'low_lag_10', 'low_lag_11', 'low_lag_12',
       'low_lag_13', 'low_lag_14'],
      dtype='object')

In [15]:
# Since we are trying to predict, many columns in here are not necessary and could impede our model
df.drop(['vol_fiat', 'volume', 'close', 'open'], axis=1, inplace=True)

In [16]:
df.columns

Index(['low', 'high', 'month', 'dayofweek', 'week_low_mean', 'week_high_mean',
       'bi_week_low_mean', 'bi_week_high_mean', 'month_low_mean',
       'month_high_mean', 'high_lag_1', 'high_lag_2', 'high_lag_3',
       'high_lag_4', 'high_lag_5', 'high_lag_6', 'high_lag_7', 'high_lag_8',
       'high_lag_9', 'high_lag_10', 'high_lag_11', 'high_lag_12',
       'high_lag_13', 'high_lag_14', 'low_lag_1', 'low_lag_2', 'low_lag_3',
       'low_lag_4', 'low_lag_5', 'low_lag_6', 'low_lag_7', 'low_lag_8',
       'low_lag_9', 'low_lag_10', 'low_lag_11', 'low_lag_12', 'low_lag_13',
       'low_lag_14'],
      dtype='object')

# Preparing Data for Model

In [17]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [18]:
train, test = train_test_split(df, shuffle=False, test_size=0.1)
train = train.dropna()

In [19]:
X_train = train.drop(['low', 'high'], axis=1)
X_test = test.drop(['low', 'high'], axis=1)
y_train = train[['low', 'high']]
y_test = test[['low', 'high']]

In [20]:
model = RandomForestRegressor(random_state=47)

In [21]:
model.fit(X_train, y_train)
predict = model.predict(X_test)

In [22]:
rmse = np.sqrt(mean_squared_error(y_test, predict))
rmse

0.15331193961522796

In [23]:
predict

array([[2.12702 , 2.28307 ],
       [2.083298, 2.19611 ],
       [1.978885, 2.142305],
       [1.894468, 2.043957],
       [1.990918, 2.137352],
       [1.863471, 2.004373],
       [1.991336, 2.175085],
       [2.074351, 2.221489],
       [2.036775, 2.191802],
       [1.895414, 2.045679],
       [1.86691 , 2.027007],
       [1.990305, 2.156624],
       [1.896693, 2.050752],
       [2.015172, 2.173286],
       [2.116996, 2.290443],
       [2.221918, 2.463395],
       [2.359283, 2.678284],
       [2.394722, 2.742321],
       [2.291628, 2.552671],
       [2.416487, 2.717891],
       [2.41578 , 2.784421],
       [2.318004, 2.579011],
       [2.274018, 2.455458],
       [2.288759, 2.452641],
       [2.415353, 2.646091],
       [2.448059, 2.742979],
       [2.791797, 3.081628],
       [2.787166, 3.096712],
       [2.831848, 3.146493],
       [2.901752, 3.165207]])

# Hyperparamter Tuning
## Sticking with Random Forest

# Searching for Best Features
## Auto Feature Engineering

In [24]:
df = pd.read_csv('cb_XTZUSD_daily.csv', index_col='date', parse_dates=[0])
df.drop('unix', axis=1, inplace=True)
df.index = pd.to_datetime(df.index)
df.sort_values(by='date', axis=0, inplace=True)
df.head()

Unnamed: 0_level_0,low,high,open,close,volume,vol_fiat
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-03-27,1.5713,1.7897,1.7645,1.5897,3969087.96,6309659.0
2020-03-28,1.5052,1.6226,1.5896,1.5655,3872624.59,6062594.0
2020-03-29,1.4664,1.5804,1.5693,1.4698,2250825.26,3308263.0
2020-03-30,1.461,1.665,1.4675,1.6077,2626418.2,4222493.0
2020-03-31,1.5532,1.6424,1.6081,1.611,1892308.96,3048510.0


In [None]:
def find_best_features(data, test_range):
    '''Takes a standard pandas dataframe and the amount of units to test. In this way, it sorts through our 
    make_features() function to find the best possible results without hyperparamter tuning.'''