In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import json

In [2]:
# function courtesy of CryptoDataDownload.com
def fetch_daily_data(symbol):
    pair_split = symbol.split('/')  # symbol must be in format XXX/XXX ie. BTC/EUR
    symbol = pair_split[0] + '-' + pair_split[1]
    url = f'https://api.pro.coinbase.com/products/{symbol}/candles?granularity=86400'
    response = requests.get(url)
    if response.status_code == 200:  # check to make sure the response from server is good
        data = pd.DataFrame(json.loads(response.text), columns=['unix', 'low', 'high', 'open', 'close', 'volume'])
        data['date'] = pd.to_datetime(data['unix'], unit='s')  # convert to a readable date
        data['vol_fiat'] = data['volume'] * data['close']      # multiply the BTC volume by closing price to approximate fiat volume
                            
        # if we failed to get any data, print an error...otherwise write the file
        if data is None:
            print("Did not return any data from Coinbase for this symbol")
        else:
            data.to_csv(f'cb_{pair_split[0] + pair_split[1]}_daily.csv', index=False)
    else:
        print("Did not receieve OK response from Coinbase API")

<strong>Pulling Bitcoin</strong><br>
To get a baseline, first working with Bitcoin.

In [3]:
fetch_daily_data('BTC/USD')

In [4]:
btc = pd.read_csv('cb_BTCUSD_daily.csv', index_col='date', parse_dates=[0])

In [5]:
btc.head()

Unnamed: 0_level_0,unix,low,high,open,close,volume,vol_fiat
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-01-19,1611014400,36200.0,37857.0,36624.23,36726.37,14877.424538,546393800.0
2021-01-18,1610928000,34736.46,37402.0,35820.01,36624.23,16609.641084,608315300.0
2021-01-17,1610841600,33850.03,36860.0,36004.8,35820.0,19182.049347,687101000.0
2021-01-16,1610755200,35372.59,37948.0,36754.6,36006.94,20861.425452,751156100.0
2021-01-15,1610668800,34298.93,39697.0,39123.05,36754.67,36421.059188,1338644000.0


In [6]:
btc.drop('unix', axis=1, inplace=True)

In [7]:
btc.info()

<class 'pandas.core.frame.DataFrame'>
Index: 300 entries, 2021-01-19 to 2020-03-26
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   low       300 non-null    float64
 1   high      300 non-null    float64
 2   open      300 non-null    float64
 3   close     300 non-null    float64
 4   volume    300 non-null    float64
 5   vol_fiat  300 non-null    float64
dtypes: float64(6)
memory usage: 16.4+ KB


In [8]:
btc.index = pd.to_datetime(btc.index)

In [9]:
btc.sort_values(by='date', axis=0, inplace=True)

In [10]:
btc.head()

Unnamed: 0_level_0,low,high,open,close,volume,vol_fiat
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-03-26,6520.13,6795.0,6691.71,6758.18,17456.117051,117971600.0
2020-03-27,6260.0,6880.0,6760.0,6372.36,20458.244021,130367300.0
2020-03-28,6030.0,6372.36,6372.36,6251.82,20353.874846,127248800.0
2020-03-29,5870.46,6279.96,6251.45,5877.21,16111.236378,94689120.0
2020-03-30,5853.0,6631.23,5878.98,6406.4,23659.802642,151574200.0


# Feature Engineering
Here we are designing our features to better predict the price.

In [11]:
def make_features(data, max_lag):
    #data['%change'] = (1 - (data['high'] / data['low'])) * -1
    #data['month'] = data.index.month
    data['dayofweek'] = data.index.dayofweek
    data['week_low_mean'] = data['low'].shift().rolling(7).mean()
    data['week_high_mean'] = data['high'].shift().rolling(7).mean()
    #data['month_low_mean'] = data['low'].shift().rolling(30).mean()
    #data['month_high_mean'] = data['high'].shift().rolling(30).mean()
    for lag in range(1, max_lag + 1):
        data['high_lag_{}'.format(lag)] = data['high'].shift(lag)
    for lag in range(1, max_lag + 1):
        data['low_lag_{}'.format(lag)] = data['low'].shift(lag)
    #for lag in range(1, max_lag + 1):
     #   data['change_lag_{}'.format(lag)] = data['%change'].shift(lag)

In [12]:
make_features(btc, 14)

In [13]:
btc.tail()

Unnamed: 0_level_0,low,high,open,close,volume,vol_fiat,dayofweek,week_low_mean,week_high_mean,high_lag_1,...,low_lag_5,low_lag_6,low_lag_7,low_lag_8,low_lag_9,low_lag_10,low_lag_11,low_lag_12,low_lag_13,low_lag_14
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-15,34298.93,39697.0,39123.05,36754.67,36421.059188,1338644000.0,4,34495.604286,39677.068571,40127.66,...,34444.0,38800.0,36565.08,36200.0,33352.54,29891.13,27678.0,32008.62,29039.0,28700.0
2021-01-16,35372.59,37948.0,36754.6,36006.94,20861.425452,751156100.0,5,34171.868571,39350.015714,39697.0,...,30100.0,34444.0,38800.0,36565.08,36200.0,33352.54,29891.13,27678.0,32008.62,29039.0
2021-01-17,33850.03,36860.0,36004.8,35820.0,19182.049347,687101000.0,6,33682.238571,38855.881429,37948.0,...,32500.0,30100.0,34444.0,38800.0,36565.08,36200.0,33352.54,29891.13,27678.0,32008.62
2021-01-18,34736.46,37402.0,35820.01,36624.23,16609.641084,608315300.0,0,33597.385714,38199.864286,36860.0,...,32309.04,32500.0,30100.0,34444.0,38800.0,36565.08,36200.0,33352.54,29891.13,27678.0
2021-01-19,36200.0,37857.0,36624.23,36726.37,14877.424538,546393800.0,1,34259.737143,38075.31,37402.0,...,36751.11,32309.04,32500.0,30100.0,34444.0,38800.0,36565.08,36200.0,33352.54,29891.13


In [14]:
btc.columns

Index(['low', 'high', 'open', 'close', 'volume', 'vol_fiat', 'dayofweek',
       'week_low_mean', 'week_high_mean', 'high_lag_1', 'high_lag_2',
       'high_lag_3', 'high_lag_4', 'high_lag_5', 'high_lag_6', 'high_lag_7',
       'high_lag_8', 'high_lag_9', 'high_lag_10', 'high_lag_11', 'high_lag_12',
       'high_lag_13', 'high_lag_14', 'low_lag_1', 'low_lag_2', 'low_lag_3',
       'low_lag_4', 'low_lag_5', 'low_lag_6', 'low_lag_7', 'low_lag_8',
       'low_lag_9', 'low_lag_10', 'low_lag_11', 'low_lag_12', 'low_lag_13',
       'low_lag_14'],
      dtype='object')

In [15]:
# Since we are trying to predict, many columns in here are not necessary and could impede our model
btc.drop(['vol_fiat', 'volume', 'close', 'open'], axis=1, inplace=True)

In [16]:
btc.columns

Index(['low', 'high', 'dayofweek', 'week_low_mean', 'week_high_mean',
       'high_lag_1', 'high_lag_2', 'high_lag_3', 'high_lag_4', 'high_lag_5',
       'high_lag_6', 'high_lag_7', 'high_lag_8', 'high_lag_9', 'high_lag_10',
       'high_lag_11', 'high_lag_12', 'high_lag_13', 'high_lag_14', 'low_lag_1',
       'low_lag_2', 'low_lag_3', 'low_lag_4', 'low_lag_5', 'low_lag_6',
       'low_lag_7', 'low_lag_8', 'low_lag_9', 'low_lag_10', 'low_lag_11',
       'low_lag_12', 'low_lag_13', 'low_lag_14'],
      dtype='object')

# Preparing Data for Model

In [17]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [18]:
train, test = train_test_split(btc, shuffle=False, test_size=0.1)
train = train.dropna()

In [19]:
X_train = train.drop(['low', 'high'], axis=1)
X_test = test.drop(['low', 'high'], axis=1)
y_train = train[['low', 'high']]
y_test = test[['low', 'high']]

In [20]:
model = RandomForestRegressor(random_state=47)

In [21]:
model.fit(X_train, y_train)
predict = model.predict(X_test)

In [22]:
rmse = np.sqrt(mean_squared_error(y_test, predict))
rmse

10462.799786584246

In [23]:
predict

array([[22055.7383, 23435.9814],
       [21636.5543, 23053.3662],
       [21395.072 , 22939.3682],
       [21409.3181, 22902.2655],
       [21623.5753, 22983.0153],
       [21981.1719, 23493.6829],
       [22271.8516, 23779.205 ],
       [21997.4022, 23642.9184],
       [22019.9822, 23649.819 ],
       [22019.9822, 23649.819 ],
       [22095.9822, 23704.4202],
       [22132.7019, 23698.712 ],
       [22170.7019, 23726.0126],
       [22203.4918, 23743.4432],
       [22019.9822, 23649.819 ],
       [22019.9822, 23649.819 ],
       [22019.9822, 23649.819 ],
       [22095.9822, 23704.4202],
       [22132.7019, 23698.712 ],
       [22170.7019, 23726.0126],
       [22203.4918, 23743.4432],
       [22019.9822, 23649.819 ],
       [22019.9822, 23649.819 ],
       [22019.9822, 23649.819 ],
       [22095.9822, 23704.4202],
       [22132.7019, 23698.712 ],
       [22170.7019, 23726.0126],
       [22203.4918, 23743.4432],
       [22019.9822, 23649.819 ],
       [22019.9822, 23649.819 ]])