# Ethereum Price Regression model
The aim of this notebook is to build a model that can predict the price of the cryptocurreny Ethereum, given past data (4 years of daily price data).
> How well can we predict the closing price of one Ether, given its previous closing price data?

<hr/>

Data can be source from Yahoo at this link: https://au.finance.yahoo.com/quote/ETH-AUD/history?period1=1541376000&period2=1604534400&interval=1d&filter=history&frequency=1d&includeAdjustedClose=true, between the periods of 18/11/2016 to 05/11/2020.

### Setup up env and look at data

In [88]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [86]:
# Import preprocessed data (if it exists)
df = pd.read_csv("data/processed_ETH-AUD",
                    low_memory=False)

In [89]:
price_data = pd.read_csv("./data/ETH-AUD-2016to2020.csv",
                         low_memory=False)

price_data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2016-11-18,13.501419,13.527241,12.809084,12.901511,12.901511,14725799
1,2016-11-19,12.910578,13.268645,12.909572,13.155561,13.155561,7061320
2,2016-11-20,13.153451,13.221361,13.009252,13.079301,13.079301,7063653
3,2016-11-21,13.080503,13.157393,12.946662,13.046948,13.046948,4967737
4,2016-11-22,13.053648,13.856814,12.990789,13.371652,13.371652,15232721


In [4]:
price_data.isna().sum()

Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

In [34]:
print(price_data.shape)
print()
print(price_data.dtypes)
print()
print(price_data.info());

(732, 7)

Date          object
Open         float64
High         float64
Low          float64
Close        float64
Adj Close    float64
Volume         int64
dtype: object

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 732 entries, 0 to 731
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       732 non-null    object 
 1   Open       732 non-null    float64
 2   High       732 non-null    float64
 3   Low        732 non-null    float64
 4   Close      732 non-null    float64
 5   Adj Close  732 non-null    float64
 6   Volume     732 non-null    int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 40.2+ KB
None


In [90]:
# Function for precprocessing the data
def preprocess_data(df):
    """
    Performs transformations on df and returns transformed df.
    """
    
    # Drop Adj Close (duplicate data)
    price_data.drop(["Adj Close"], axis=1, inplace=True)
    
    # Transform Date into pd.readable form
    price_data["Date"] = pd.to_datetime(price_data["Date"])
    df["saleYear"] = df.Date.dt.year
    df["saleMonth"] = df.Date.dt.month
    df["saleDay"] = df.Date.dt.day
    df["saleDayOfWeek"] = df.Date.dt.dayofweek
    df["saleDayOfYear"] = df.Date.dt.dayofyear
    # Drop Date
    df.drop(["Date"], axis=1, inplace=True)
    
    # Scale Volume data between 0 and 1000
    min_max_scaler = MinMaxScaler(feature_range=(0, 1000))
    price_data["Volume"] = min_max_scaler.fit_transform(pd.DataFrame(price_data["Volume"]))

    return df

In [98]:
df = preprocess_data(price_data)
df

KeyError: "['Adj Close'] not found in axis"

In [93]:
# Save/Export processed dataframe
df.to_csv("data/processed_ETH-AUD",
              index=False)

In [103]:
df_train = df[ df.saleYear != 2020 ]
df_test = df[ df.saleYear == 2016 ]

# Split data into X & y, and split
X_train, y_train = df_train.drop("Close", axis=1), df_train.Close
X_test, y_test = df_test.drop("Close", axis=1), df_test.Close

In [110]:
# Create evaluation function (the competition uses RMSLE)
from sklearn.metrics import mean_squared_log_error, mean_absolute_error, r2_score

# NOTE: root_mean_squared_log_error is added by me to scikit-learn, ordinarily we would need
# to use this function
def rmsle(y_test, y_preds):
    """
    Calculates root mean squared log error between predictions and
    true labels.
    """
    return np.sqrt(mean_squared_log_error(y_test, y_preds))

# Create function to evaluate model on a few different levels
def show_scores(model):
    train_preds = model.predict(X_train)
    val_preds = model.predict(X_test)
    scores = {"Training MAE": mean_absolute_error(y_train, train_preds),
             "Test MAE": mean_absolute_error(y_test, val_preds),
             "Training RMSLE": rmsle(y_train, train_preds),
             "Test RMSEL": rmsle(y_test, val_preds),
             "Training R^2": r2_score(y_train, train_preds),
             "Test R^2": r2_score(y_test, val_preds)}
    
    return scores

In [112]:
%%time

# Change max samples in RandomForestRegressor
model = RandomForestRegressor(n_jobs=-1)

model.fit(X_train, y_train)

CPU times: user 567 ms, sys: 32.5 ms, total: 600 ms
Wall time: 269 ms


RandomForestRegressor(n_jobs=-1)

In [113]:
show_scores(model)

{'Training MAE': 3.402789096883211,
 'Test MAE': 0.09398622045454416,
 'Training RMSLE': 0.012843748967852767,
 'Test RMSEL': 0.011625957722585867,
 'Training R^2': 0.999493663984889,
 'Test R^2': 0.9828340086607836}

In [115]:
model.score(X_test, y_test)

# NOTE: This score is too high

0.9828340086607836