In [496]:
#VIZ LIBRARY
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm as tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import plotly_express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

#Preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline


#CLASSICAL STATS
import scipy
import statsmodels
from scipy import signal
import statsmodels.api as sm
from fbprophet import Prophet
from scipy.signal import butter, deconvolve
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.api import SimpleExpSmoothing, Holt
from statsmodels.tsa.holtwinters import ExponentialSmoothing

#DEEP LEARNING LIB
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping
from tensorflow.keras.utils import plot_model

#METRICS
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report, r2_score,mean_absolute_error,mean_squared_error


import warnings 
warnings.filterwarnings('ignore')

In [2]:
#read the data
df = pd.read_csv('data/coinbaseUSD_1-min_data_2014-12-01_to_2019-01-09.csv')

In [3]:
df.head()

Unnamed: 0,Timestamp,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price
0,1417411980,300.0,300.0,300.0,300.0,0.01,3.0,300.0
1,1417412040,,,,,,,
2,1417412100,,,,,,,
3,1417412160,,,,,,,
4,1417412220,,,,,,,


In [4]:
df['date'] = pd.to_datetime(df['Timestamp'], unit='s').dt.date

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2099760 entries, 0 to 2099759
Data columns (total 9 columns):
Timestamp            int64
Open                 float64
High                 float64
Low                  float64
Close                float64
Volume_(BTC)         float64
Volume_(Currency)    float64
Weighted_Price       float64
date                 object
dtypes: float64(7), int64(1), object(1)
memory usage: 144.2+ MB


In [6]:
df.shape

(2099760, 9)

In [7]:
df.describe().drop('Timestamp',axis=1)

Unnamed: 0,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price
count,1990691.0,1990691.0,1990691.0,1990691.0,1990691.0,1990691.0,1990691.0
mean,3246.403,3247.829,3244.856,3246.403,7.849139,36001.57,3246.341
std,3799.154,3801.394,3796.761,3799.15,18.73222,140187.9,3799.078
min,0.06,0.06,0.06,0.06,1e-08,2.6417e-06,0.06
25%,419.58,419.64,419.5,419.57,0.9024,644.0031,419.5645
50%,1014.58,1014.89,1014.15,1014.53,2.6929,3695.642,1014.512
75%,6322.63,6324.01,6321.09,6322.67,7.600965,19723.92,6322.55
max,19891.99,19891.99,19891.98,19891.99,1563.267,19970760.0,19891.99


### CHEKING NULL VALUES

In [8]:
df.isnull().sum()

Timestamp                 0
Open                 109069
High                 109069
Low                  109069
Close                109069
Volume_(BTC)         109069
Volume_(Currency)    109069
Weighted_Price       109069
date                      0
dtype: int64

In [9]:
#Approximately 5% of our data are missing

In [38]:
#getting the real price of bitcoin
real_price = df.groupby('date')['Weighted_Price'].mean()
bitcoin_df = pd.DataFrame(data = real_price)
bitcoin_df = bitcoin_df.reset_index()
bitcoin_df.columns = ['date', 'price']

bitcoin_df.head(3)

Unnamed: 0,date,price
0,2014-12-01,335.0
1,2014-12-02,377.854911
2,2014-12-03,377.818333


### EDA

In [44]:
fig = go.Figure()

fig.add_trace(go.Scatter(x = bitcoin_df.date, y = bitcoin_df.price, marker_color='lightgreen'))

fig.update_layout(title = 'BITCOIN PRICE VS TIME', template = 'plotly_dark', height=500)
fig.update_xaxes(title='Date')
fig.update_yaxes(title='Bitcoin Price')

fig.show()

#### SNIPPET GRAPH FOR THE LAST 30 DAYS

In [50]:
fig = go.Figure()

fig.add_trace(go.Scatter(x = bitcoin_df['date'][-30:], y = bitcoin_df['price'][-30:], marker_color='lightgreen'))

fig.update_layout(title = 'BITCOIN PRICE(LAST 30 DAYS) VS TIME', template = 'plotly_dark', height=300)
fig.update_xaxes(title='Date')
fig.update_yaxes(title='Bitcoin Price')

fig.show()

## FORECASTING (CLASSICAL STATATISTICS METHOD)

In [253]:
df_train = bitcoin_df[:-30]
df_test = bitcoin_df[-30:]

### NAIVE APPROACH

In [466]:
predictions = []

for i in range(len(df_test.date)):
    
    if i == 0:
        predictions.append(df_train['price'].values[-1])
    else:
        predictions.append(df_test['price'].values[i-1])
        
predictions = np.transpose(np.array([row.tolist() for row in predictions]))

In [467]:
fig = go.Figure()

fig.add_trace(go.Scatter(x = df_test['date'], y = df_test['price'], marker_color='lightgreen', name='Original Data'))
fig.add_trace(go.Scatter(x = df_test['date'], y = predictions, marker_color='violet', name='Forecast'))

fig.update_layout(title = 'PREDICTED DATA vs ORIGINAL DATA', template = 'plotly_dark', height=300)
fig.update_xaxes(title='Date')
fig.update_yaxes(title='Bitcoin Price')

fig.show()

In [468]:
nb_error_rate = abs(((df_test.price.values - predictions) / df_test.price.values).mean()) * 100
print('MAPE: ', round(nb_error_rate,2), '%')

MAPE:  0.53 %


In [471]:
nb_mae = mean_absolute_error(predictions, df_test.price.values)
nb_mse = mean_squared_error(predictions, df_test.price.values)
nb_rmse = np.sqrt(mean_squared_error(predictions, df_test.price.values))

print('Mean Absolute Error:   ', nb_mae)
print('Mean Squared Error:   ', nb_mse)
print('Root Mean Squared Error:   ', nb_rmse)

Mean Absolute Error:    93.79345920781715
Mean Squared Error:    14700.643747052169
Root Mean Squared Error:    121.24621126885643


### MOVING AVERAGE METHOD

In [472]:
predictions = []
for i in range(len(df_test.date)):
    if i == 0:
        predictions.append(np.mean(df_train['price'][-30:].values))
    if i < (len(df_test.price)+1) and i > 0:
        predictions.append(0.5 * (np.mean(df_train['price'][-30+i:].values) + \
                                  np.mean(predictions[:i])))
    if i > (len(df_test.price)+1):
        predictions.append(np.mean(predictions))

In [473]:
predictions = np.transpose(np.array([row.tolist() for row in predictions]))

In [474]:
fig = go.Figure()

fig.add_trace(go.Scatter(x = df_test['date'], y = df_test['price'], marker_color='lightgreen', name='Original Data'))
fig.add_trace(go.Scatter(x = df_test['date'], y = predictions, marker_color='violet', name='Forecast'))

fig.update_layout(title = 'PREDICTED DATA vs ORIGINAL DATA', template = 'plotly_dark', height=300)
fig.update_xaxes(title='Date')
fig.update_yaxes(title='Bitcoin Price')

fig.show()

In [475]:
mave_error_rate = abs(((df_test.price.values - predictions) / df_test.price.values).mean()) * 100
print('MAPE: ', round(mave_error_rate,2), '%')

MAPE:  15.54 %


In [478]:
mave_mae = mean_absolute_error(predictions, df_test.price.values)
mave_mse = mean_squared_error(predictions, df_test.price.values)
mave_rmse = np.sqrt(mean_squared_error(predictions, df_test.price.values))

print('Mean Absolute Error:   ', mave_mae)
print('Mean Squared Error:   ', mave_mse)
print('Root Mean Squared Error:   ', mave_rmse)

Mean Absolute Error:    555.8210831911916
Mean Squared Error:    510577.08270188636
Root Mean Squared Error:    714.5467673300932


### HOLTLINEAR

In [487]:
predictions = []

fit = Holt(np.asarray(df_train['price'][-30:])).fit(smoothing_level = 0.1,smoothing_slope = 0.01)
predictions = fit.forecast(len(df_test.price))

In [488]:
fig = go.Figure()

fig.add_trace(go.Scatter(x = df_test['date'], y = df_test['price'], marker_color='lightgreen', name='Original Data'))
fig.add_trace(go.Scatter(x = df_test['date'], y = predictions, marker_color='violet', name='Forecast'))

fig.update_layout(title = 'PREDICTED DATA vs ORIGINAL DATA', template = 'plotly_dark', height=300)
fig.update_xaxes(title='Date')
fig.update_yaxes(title='Bitcoin Price')

fig.show()

In [489]:
ht_error_rate = abs(((df_test.price.values - predictions) / df_test.price.values).mean()) * 100
print('MAPE:', round(ht_error_rate,2), '%')

MAPE: 3.99 %


In [490]:
ht_mae = mean_absolute_error(predictions, df_test.price.values)
ht_mse = mean_squared_error(predictions, df_test.price.values)
ht_rmse = np.sqrt(mean_squared_error(predictions, df_test.price.values))

print('Mean Absolute Error:   ', ht_mae)
print('Mean Squared Error:   ', ht_mse)
print('Root Mean Squared Error:   ', ht_rmse)

Mean Absolute Error:    286.9598380313562
Mean Squared Error:    140883.52720141326
Root Mean Squared Error:    375.34454465385966


### HOLT-WINTER (EXPONENTIAL SMOOTHING)

In [531]:
predictions = []

fit = ExponentialSmoothing(np.asarray(df_train.price) ,seasonal_periods=7,trend='add', seasonal='add',).fit()
predictions = fit.forecast(len(df_test.price))

In [532]:
fig = go.Figure()

fig.add_trace(go.Scatter(x = df_test['date'], y = df_test['price'], marker_color='lightgreen', name='Original Data'))
fig.add_trace(go.Scatter(x = df_test['date'], y = predictions, marker_color='violet', name='Forecast'))

fig.update_layout(title = 'PREDICTED DATA vs ORIGINAL DATA', template = 'plotly_dark', height=300)
fig.update_xaxes(title='Date')
fig.update_yaxes(title='Bitcoin Price')

fig.show()

In [533]:
hw_error_rate = abs(((df_test.price.values - predictions) / df_test.price.values).mean()) * 100
print('MAPE:', round(hw_error_rate,2), '%')

MAPE: 7.8 %


In [534]:
hw_mae = mean_absolute_error(predictions, df_test.price.values)
hw_mse = mean_squared_error(predictions, df_test.price.values)
hw_rmse = np.sqrt(mean_squared_error(predictions, df_test.price.values))

print('Mean Absolute Error:   ', hw_mae)
print('Mean Squared Error:   ', hw_mse)
print('Root Mean Squared Error:   ', hw_rmse)

Mean Absolute Error:    341.35017591279785
Mean Squared Error:    157848.74334800063
Root Mean Squared Error:    397.30182902674966


### ARIMA

In [580]:
predictions = []
fit = sm.tsa.statespace.SARIMAX(df_train.price[-30:].values, order=(1, 0, 0),seasonal_order=(0, 1, 1, 7)).fit()
predictions.append(fit.forecast(30))
predictions = np.array(predictions).reshape((30, ))

In [581]:
fig = go.Figure()

fig.add_trace(go.Scatter(x = df_test['date'], y = df_test['price'], marker_color='lightgreen', name='Original Data'))
fig.add_trace(go.Scatter(x = df_test['date'], y = predictions, marker_color='violet', name='Forecast'))

fig.update_layout(title = 'PREDICTED DATA vs ORIGINAL DATA', template = 'plotly_dark', height=300)
fig.update_xaxes(title='Date')
fig.update_yaxes(title='Bitcoin Price')

fig.show()

In [582]:
arima_error_rate = abs(((df_test.price.values - predictions) / df_test.price.values).mean()) * 100
print('MAPE:', round(arima_error_rate,2), '%')

MAPE: 11.09 %


In [583]:
arima_mae = mean_absolute_error(predictions, df_test.price.values)
arima_mse = mean_squared_error(predictions, df_test.price.values)
arima_rmse = np.sqrt(mean_squared_error(predictions, df_test.price.values))

print('Mean Absolute Error:   ', arima_mae)
print('Mean Squared Error:   ', arima_mse)
print('Root Mean Squared Error:   ', arima_rmse)

Mean Absolute Error:    392.7553271072802
Mean Squared Error:    206119.81043164458
Root Mean Squared Error:    454.00419649122693


### FBPROPHET

In [655]:

predictions = []
dates = df_train.date[-30:]
data = df_train.price[-30:].values

df = pd.DataFrame()
df['ds'], df['y'] = df_test.date,data.tolist()
model = Prophet(daily_seasonality=True)
model.fit(df)
future = model.make_future_dataframe(periods=30)
forecast = model.predict(future)["yhat"].loc[30:].values
predictions.append(forecast)
predictions = np.array(predictions).reshape(30,)

INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:n_changepoints greater than number of observations. Using 23.


In [656]:
fig = go.Figure()

fig.add_trace(go.Scatter(x = df_test['date'], y = df_test['price'], marker_color='lightgreen', name='Original Data'))
fig.add_trace(go.Scatter(x = df_test['date'], y = predictions, marker_color='violet', name='Forecast'))

fig.update_layout(title = 'PREDICTED DATA vs ORIGINAL DATA', template = 'plotly_dark', height=300)
fig.update_xaxes(title='Date')
fig.update_yaxes(title='Bitcoin Price')

fig.show()

In [657]:
fb_error_rate = abs(((df_test.price.values - predictions) / df_test.price.values).mean()) * 100
print('MAPE:', round(fb_error_rate,2), '%')

MAPE: 23.88 %


In [659]:
fb_mae = mean_absolute_error(predictions, df_test.price.values)
fb_mse = mean_squared_error(predictions, df_test.price.values)
fb_rmse = np.sqrt(mean_squared_error(predictions, df_test.price.values))

print('Mean Absolute Error:   ', fb_mae)
print('Mean Squared Error:   ', fb_mse)
print('Root Mean Squared Error:   ', fb_rmse)

Mean Absolute Error:    922.5354031146916
Mean Squared Error:    1243887.9431874955
Root Mean Squared Error:    1115.2972443198698


### MODEL COMPARISSON BY MAPE

In [669]:
models = ['NAIVE APPROACH', 'MOVING AVE.', 'HOLT-LINEAR', 'HOLT-WINTER',
          'ARIMA', 'FBPROPHET']

err_rate = [nb_error_rate, mave_error_rate, ht_error_rate, hw_error_rate,
            arima_error_rate, fb_error_rate]
err_rate = [round(err,2) for err in err_rate]

In [716]:
#SHOW FIG
colors = ['rgb(255,50,50)','rgb(50,1,408)', 'rgb(255,80,255)', 'rgb(23,255,255)',
          'rgb(125,3,255)', 'rgb(50,255,70)']
fig = go.Figure()

fig.add_trace(go.Bar(x = models, y=err_rate, marker_color=colors, text=err_rate))

fig.update_xaxes(title='Model')
fig.update_yaxes(title='Error %')

fig.update_layout(title = 'MODEL COMPARISON BY MAPE', template='plotly_dark', height=400)
fig.show()

In [717]:
#END