# Ethereum Price Classification
This is a notebook that aims to predict whether the price of one Eth the next day will be higher or lower than the previous day given 4 years of daily price data.

Note that this largely an extension of the Ethereum Price Regression nb.

Data can be source from Yahoo at this link: https://au.finance.yahoo.com/quote/ETH-AUD/history?period1=1541376000&period2=1604534400&interval=1d&filter=history&frequency=1d&includeAdjustedClose=true, between the periods of 18/11/2016 to 05/11/2020.

In [13]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Models from SCikit-Learn
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [25]:
# Import preprocessed data (if it exists)
df = pd.read_csv("data/processed_ETH-AUD",
                    low_memory=False)
df.head();

In [58]:
# Add a column for whether the model closes higher than opens
df["CloseHigher"] = df.apply(lambda row: 
                                         1 if row["Open"] < row["Close"] else 0, 
                                         axis=1)

# Add a column for whether the next day closes higher than the previous day open
df["CloseHigherNextDay"] = 0

for i in  range(1, len(df)):
    if df.iloc[i]["Close"] > df.iloc[i-1]["Open"]:
        df.at[i, 'CloseHigherNextDay'] = 1
    else:
        df.at[i, 'CloseHigherNextDay'] = 0
    

# df["NextDayHigher"].value_counts()
df

Unnamed: 0,Open,High,Low,Close,Volume,saleYear,saleMonth,saleDay,saleDayOfWeek,saleDayOfYear,CloseHigher,CloseHigherNextDay
0,13.501419,13.527241,12.809084,12.901511,0.222003,2016,11,18,4,323,0,0
1,12.910578,13.268645,12.909572,13.155561,0.051970,2016,11,19,5,324,1,0
2,13.153451,13.221361,13.009252,13.079301,0.052022,2016,11,20,6,325,0,1
3,13.080503,13.157393,12.946662,13.046948,0.005525,2016,11,21,0,326,0,0
4,13.053648,13.856814,12.990789,13.371652,0.233249,2016,11,22,1,327,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1457,652.833923,653.569824,623.068298,632.921814,314.558834,2020,11,14,5,319,0,0
1458,632.921936,634.083069,605.527161,613.828186,313.547229,2020,11,15,6,320,0,0
1459,613.828064,633.913269,611.277710,628.373657,346.664635,2020,11,16,0,321,1,0
1460,634.475647,660.155151,629.372925,658.429077,443.646459,2020,11,17,1,322,1,1


In [60]:
np.random.seed(7)

# Split data
X = df.drop(["CloseHigherNextDay"], axis=1)
y = df["CloseHigherNextDay"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=False, stratify=None)

X

Unnamed: 0,Open,High,Low,Close,Volume,saleYear,saleMonth,saleDay,saleDayOfWeek,saleDayOfYear,CloseHigher
0,13.501419,13.527241,12.809084,12.901511,0.222003,2016,11,18,4,323,0
1,12.910578,13.268645,12.909572,13.155561,0.051970,2016,11,19,5,324,1
2,13.153451,13.221361,13.009252,13.079301,0.052022,2016,11,20,6,325,0
3,13.080503,13.157393,12.946662,13.046948,0.005525,2016,11,21,0,326,0
4,13.053648,13.856814,12.990789,13.371652,0.233249,2016,11,22,1,327,1
...,...,...,...,...,...,...,...,...,...,...,...
1457,652.833923,653.569824,623.068298,632.921814,314.558834,2020,11,14,5,319,0
1458,632.921936,634.083069,605.527161,613.828186,313.547229,2020,11,15,6,320,0
1459,613.828064,633.913269,611.277710,628.373657,346.664635,2020,11,16,0,321,1
1460,634.475647,660.155151,629.372925,658.429077,443.646459,2020,11,17,1,322,1


In [61]:
%%time

# Put models in a dictionary
models = {"Logistic Regression": LogisticRegression(),
          "KNN": KNeighborsClassifier(),
          "Random Forest": RandomForestClassifier(),
          "Linear-SVC": SVC(kernel="linear", C=0.025),
         }

# Create a function that both fits and scores our models
def fit_and_score(models, X_train, X_test, y_train, y_test):
    """
    Fits and scores ML models.
    models: a dict of Sklearn models
    X_train: training data
    X_test: test data
    y_train: labels for training data
    y_test: labels for test data
    """
    
    np.random.seed(7)
    # Set up empty dict for our model scores
    model_scores = {}
    fitted_models = []
    
    for name, model in models.items():
        model.fit(X_train, y_train)
        model_scores[name] = model.score(X_test, y_test)
        fitted_models.append(model)
        
    return [model_scores, fitted_models]

CPU times: user 90 µs, sys: 2 µs, total: 92 µs
Wall time: 94.2 µs


In [62]:
model_scores = fit_and_score(models=models,
                             X_train=X_train,
                             X_test=X_test,
                             y_train=y_train,
                             y_test=y_test);

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [63]:
model_scores

[{'Logistic Regression': 0.6996587030716723,
  'KNN': 0.5460750853242321,
  'Random Forest': 0.6655290102389079,
  'Linear-SVC': 0.7201365187713311},
 [LogisticRegression(),
  KNeighborsClassifier(),
  RandomForestClassifier(),
  SVC(C=0.025, kernel='linear')]]