# Bitcoin price prediction

Examine historical Bitcoin prices and see whether we can predict future prices.

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [3]:
%%time
# Load the raw 1-minute Coinbase USD price data from CSV and preview the first rows.
# (No resampling happens in this cell; the daily aggregation is done in a later cell.)
bit_df = pd.read_csv('Data/coinbaseUSD_1-min_data_2014-12-01_to_2018-01-08.csv')
bit_df.head(20)

Wall time: 2.64 s


Unnamed: 0,Timestamp,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price
0,1417411980,300.0,300.0,300.0,300.0,0.01,3.0,300.0
1,1417412040,300.0,300.0,300.0,300.0,0.01,3.0,300.0
2,1417412100,300.0,300.0,300.0,300.0,0.01,3.0,300.0
3,1417412160,300.0,300.0,300.0,300.0,0.01,3.0,300.0
4,1417412220,300.0,300.0,300.0,300.0,0.01,3.0,300.0
5,1417412280,300.0,300.0,300.0,300.0,0.01,3.0,300.0
6,1417412340,300.0,300.0,300.0,300.0,0.01,3.0,300.0
7,1417412400,300.0,300.0,300.0,300.0,0.01,3.0,300.0
8,1417412460,300.0,300.0,300.0,300.0,0.01,3.0,300.0
9,1417412520,300.0,300.0,300.0,300.0,0.01,3.0,300.0


In [4]:
bit_df.tail()

Unnamed: 0,Timestamp,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price
1574269,1515369360,16174.21,16174.23,16174.21,16174.23,7.594119,122828.95677,16174.221301
1574270,1515369420,16174.23,16174.23,16174.21,16174.22,11.902468,192513.15094,16174.221081
1574271,1515369480,16174.22,16174.22,16174.21,16174.21,3.86084,62446.073684,16174.218136
1574272,1515369540,16174.22,16174.22,16174.21,16174.22,1.179093,19070.914509,16174.219514
1574273,1515369600,16174.22,16174.23,16174.22,16174.22,5.401224,87360.593222,16174.220219


In [5]:
%%time
# Convert unix time to datetime
bit_df['date'] = pd.to_datetime(bit_df.Timestamp, unit='s')
# Reset index
bit_df = bit_df.set_index('date')
# Rename columns so easier to code
bit_df = bit_df.rename(columns={'Open':'open', 'High': 'hi', 'Low': 'lo', 
                       'Close': 'close', 'Volume_(BTC)': 'vol_btc',
                       'Volume_(Currency)': 'vol_cur', 
                       'Weighted_Price': 'wp', 'Timestamp': 'ts'})
# Resample and only use recent samples that aren't missing
bit_df = bit_df.resample('d').agg({'open': 'first', 'hi': 'max', 
    'lo': 'min', 'close': 'last', 'vol_btc': 'sum',
    'vol_cur': 'sum', 'wp': 'mean', 'ts': 'min'}).iloc[-1000:]
# drop last row as it is not complete
bit_df = bit_df.iloc[:-1]

Wall time: 651 ms


In [6]:
bit_df.shape

(999, 8)

In [7]:
bit_df.head()

Unnamed: 0_level_0,open,hi,lo,close,vol_btc,vol_cur,wp,ts
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-04-15,221.05,224.52,218.09,224.35,9137.584626,2029524.0,222.270771,1429056000.0
2015-04-16,224.35,230.75,223.66,229.45,8636.837666,1970069.0,227.779413,1429142000.0
2015-04-17,229.45,230.08,220.46,223.62,7738.360003,1741497.0,224.970863,1429229000.0
2015-04-18,223.55,224.04,222.18,224.02,5065.649127,1131269.0,223.39817,1429315000.0
2015-04-19,224.02,226.72,223.44,224.03,5279.787312,1185860.0,224.667792,1429402000.0


In [8]:
bit_df.tail()

Unnamed: 0_level_0,open,hi,lo,close,vol_btc,vol_cur,wp,ts
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-01-03,14781.52,15400.0,14628.0,15098.14,17616.327677,264226400.0,14989.93935,1514938000.0
2018-01-04,15098.23,15400.0,14230.0,15144.99,19567.016783,290904500.0,14851.47788,1515024000.0
2018-01-05,15145.0,17178.0,14819.78,16960.01,22588.046307,361988800.0,15796.596968,1515110000.0
2018-01-06,16960.01,17174.0,16251.01,17098.99,13744.804428,229688500.0,16661.988247,1515197000.0
2018-01-07,17099.0,17115.01,15755.01,16174.22,12201.377159,200282500.0,16526.973583,1515283000.0


## Predict tomorrow's close based on today's info
As a warm-up, first predict the open price from the `ts` (Unix timestamp) column alone.

In [9]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import metrics

In [10]:
bit_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 999 entries, 2015-04-15 to 2018-01-07
Freq: D
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   open     999 non-null    float64
 1   hi       999 non-null    float64
 2   lo       999 non-null    float64
 3   close    999 non-null    float64
 4   vol_btc  999 non-null    float64
 5   vol_cur  999 non-null    float64
 6   wp       999 non-null    float64
 7   ts       999 non-null    float64
dtypes: float64(8)
memory usage: 70.2 KB


In [11]:
bit_df.describe()

Unnamed: 0,open,hi,lo,close,vol_btc,vol_cur,wp,ts
count,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0
mean,1838.11015,1914.705946,1757.861101,1854.054895,10350.019927,35537350.0,1842.224578,1472170000.0
std,3169.30283,3346.396825,2966.458499,3201.173766,9947.604406,109736500.0,3165.632076,24929060.0
min,211.16,224.04,0.06,211.16,1525.187302,792546.5,215.969046,1429056000.0
25%,384.615,393.5,371.3,384.85,5320.152105,2297687.0,384.227045,1450613000.0
50%,629.79,639.85,618.13,630.83,7367.614526,3484114.0,632.673702,1472170000.0
75%,1454.145,1512.965,1440.095,1502.495,11954.181722,19858040.0,1471.613323,1493726000.0
max,19650.02,19891.99,19010.0,19650.01,165763.024424,1393385000.0,19537.276941,1515283000.0


In [12]:
bit_df.corr()

Unnamed: 0,open,hi,lo,close,vol_btc,vol_cur,wp,ts
open,1.0,0.997562,0.996579,0.99714,0.526337,0.836192,0.9988,0.650204
hi,0.997562,1.0,0.996036,0.999194,0.540711,0.852631,0.998989,0.646562
lo,0.996579,0.996036,1.0,0.997887,0.503904,0.807838,0.998538,0.658249
close,0.99714,0.999194,0.997887,1.0,0.527407,0.838351,0.999277,0.650609
vol_btc,0.526337,0.540711,0.503904,0.527407,1.0,0.689433,0.52529,0.347119
vol_cur,0.836192,0.852631,0.807838,0.838351,0.689433,1.0,0.833692,0.445663
wp,0.9988,0.998989,0.998538,0.999277,0.52529,0.833692,1.0,0.652363
ts,0.650204,0.646562,0.658249,0.650609,0.347119,0.445663,0.652363,1.0


In [13]:
# Part one: the target is the daily open price; the candidate features are
# every other daily column.
full_y = bit_df.loc[:, 'open']
full_x = bit_df.loc[:, ['hi', 'lo', 'close', 'vol_btc', 'vol_cur', 'wp', 'ts']]

In [14]:
from sklearn.model_selection import train_test_split

# Random train/test split (train_test_split's default proportions), seeded so
# the split is reproducible.
(full_x_train, full_x_test,
 full_y_train, full_y_test) = train_test_split(full_x, full_y, random_state=2)
# The held-out target used to be bound to the misspelled name `full_t_test`;
# keep that name as an alias so cells that still read it continue to work.
full_t_test = full_y_test

In [15]:
from sklearn.preprocessing import StandardScaler

# Learn the per-column scaling statistics from the training rows only, then
# apply that same scaling to both splits. Both names are rebound to the
# scaler's array output.
scaler = StandardScaler()
scaler.fit(full_x_train)
full_x_train = scaler.transform(full_x_train)
full_x_test = scaler.transform(full_x_test)

In [16]:
full_x_train.shape

(749, 7)

In [17]:
# Part one uses a single feature: column 6 of the scaled matrices, which is
# 'ts' (the last of the seven columns selected for full_x). The targets are
# passed through unchanged.
x_train, x_test = full_x_train[:, 6], full_x_test[:, 6]
y_train, y_test = full_y_train, full_t_test

## First part: Predict the open price from the `ts` component

In [18]:
from sklearn.linear_model import LinearRegression

# scikit-learn estimators expect a 2-D feature matrix, so turn the 1-D 'ts'
# vector into a single-column array before fitting.
x_train = x_train.reshape(-1, 1)
model = LinearRegression()
model.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [19]:
# Same single-column reshape for the held-out 'ts' values, then predict the open price.
x_test = x_test.reshape(-1, 1)
predicted = model.predict(x_test)

In [20]:
from sklearn.metrics import mean_absolute_error as mae

In [21]:
# Mean absolute error on the held-out rows. Arguments follow scikit-learn's
# (y_true, y_pred) order; MAE is symmetric, so the value itself is unchanged.
mae(y_test, predicted)

1419.2864831618238

## Second part: Predict tomorrow's close price from today's data

In [22]:
# Part two: all eight daily columns are the features and the close price is
# the target. At this point both still cover the same days.
full_y = bit_df.loc[:, 'close']
full_x = bit_df.loc[:, ['open', 'hi', 'lo', 'close', 'vol_btc', 'vol_cur', 'wp', 'ts']]

In [23]:
full_x.shape

(999, 8)

In [24]:
full_y.shape

(999,)

### We want to predict the next day's close price from the current day's data, so each feature row is paired with the following day's close

In [25]:
# Pair each day's features with the NEXT day's close.
# The number of (today, tomorrow) pairs is one less than the number of daily
# rows. Deriving it from bit_df, rather than hard-coding 998, keeps the cell
# correct if the row count changes. Re-running the cell is also safe: the
# slices below are no-ops once both objects already have n_pairs rows.
n_pairs = len(bit_df) - 1
full_x = full_x.iloc[:n_pairs]                    # every row except the last is a feature row
full_y = full_y.iloc[len(full_y) - n_pairs:]      # targets start one day later than the features

In [26]:
full_x.shape

(998, 8)

In [27]:
full_y.shape

(998,)

In [28]:
# Random (shuffled) train/test split of the (today, next-day close) pairs, seeded for reproducibility.
# NOTE(review): the rows are consecutive days, so a shuffled split lets the model train on days
# adjacent to (and later than) the test days; the resulting test MAE is likely optimistic.
# Consider a chronological split (e.g. shuffle=False) — confirm before relying on the score.
x_train, x_test, y_train, y_test = train_test_split(full_x, full_y, random_state = 2)

In [29]:
from sklearn.preprocessing import StandardScaler

# Fresh scaler for part two: its statistics come from the training rows only
# and are reused unchanged for the test rows.
scaler = StandardScaler()
scaler.fit(x_train)
X_train = scaler.transform(x_train)
X_test = scaler.transform(x_test)

In [30]:
# Fit a linear regression on the standardized training features (target: next-day close).
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [31]:
# Predict the next-day close for the held-out rows.
predicted = model.predict(X_test)

In [32]:
# Mean absolute error of the next-day close predictions on the held-out rows.
# Arguments follow scikit-learn's (y_true, y_pred) order; MAE is symmetric, so
# the value itself is unchanged.
mae(y_test, predicted)

85.37116093152476

## Since the mean absolute error is satisfactory, we can now train the model on all the data we have in order to make a more accurate prediction

### To predict tomorrow's closing price, we pass all of today's values (i.e. the last row of bit_df) to the model

In [33]:
from sklearn.preprocessing import StandardScaler

# Refit the scaler on every available feature row (no hold-out this time).
scaler = StandardScaler()
X_train = scaler.fit_transform(full_x)
# Rows to predict from: the last two daily rows of bit_df. Columns are selected
# by name from full_x so the feature set and order always match what the scaler
# was fitted on, even if bit_df later gains or reorders columns.
feed = scaler.transform(bit_df[full_x.columns].iloc[-2:])

In [34]:
# Final model: refit the linear regression on all standardized feature rows and their next-day closes.
model = LinearRegression()
model.fit(X_train, full_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [35]:
feed

array([[4.83271523, 4.6124446 , 4.94593735, 4.81534861, 0.34145895,
        1.77275702, 4.73750721, 1.73031615],
       [4.87708932, 4.59463141, 4.77683584, 4.52352037, 0.18630058,
        1.50448508, 4.69438922, 1.7337872 ]])

In [36]:
# One prediction per row of `feed`, i.e. one for each of the last two daily rows.
y = model.predict(feed)

The actual close price of the last data point is:

In [37]:
# Last target value, selected by position. full_y is indexed by date, so a bare
# integer key (full_y[-1]) relies on the deprecated positional fallback of
# Series.__getitem__; .iloc makes the positional intent explicit.
full_y.iloc[-1]

16174.22

The predicted close price for the last data point is:

In [38]:
# First of the two predictions: made from the second-to-last row of bit_df,
# so it is the model's estimate of the last row's close.
y[-2]

17388.196250754147

#### Finally, this is the prediction of tomorrow's close price based on today's data, i.e. the data from the last row

In [39]:
# Second prediction: made from the last row of bit_df, i.e. the forecast for the following day's close.
y[-1]

15999.740639953881

## This is the predicted close price for tomorrow