In [373]:
import pandas as pd
import numpy as np
import math

from sklearn.model_selection import TimeSeriesSplit
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [374]:
csv_file = 'BTC_USD Bitfinex Historical Data-2.csv'
df = pd.read_csv(csv_file, thousands=',')
df = df[['Price', 'Open', 'High', 'Low', 'Change %']]
df['Change %'] = [x[:-1] for x in df['Change %']]

df = pd.DataFrame(df)

training_data = df.iloc[1:42]
testing_data = df.iloc[42:]

In [375]:
X1 = training_data[['Price', 'Open', 'High', 'Low']]
y1 = training_data['Change %']

X2 = testing_data[['Price', 'Open', 'High', 'Low']]
y2 = testing_data['Change %']

In [393]:
def model_tester(model, X1, y1, X2, y2):
    
    tuned_model = model(X1, y1)
    y_preds = tuned_model.predict(X2)
    mae = mean_absolute_error(y_preds, y2)
    
    return mae

In [394]:
def decision_tree_mae(train_X, test_X, train_y, test_y, max_leaf_nodes):
    model = DecisionTreeRegressor(max_leaf_nodes = max_leaf_nodes, 
                                  random_state=0)
    model.fit(train_X, train_y)
    preds_y = model.predict(test_X)
    mae = mean_absolute_error(test_y, preds_y)
    return mae


def decision_tree_crossval_mae(data_X, data_y, max_leaf_nodes, no_of_iterations):
    
    n = no_of_iterations
    
    list_of_maes = []
    
    for i in range(0, n):
        X_train, X_test, y_train, y_test = train_test_split(data_X, data_y, 
                                                            test_size=0.4, random_state=i)
        mae = decision_tree_mae(X_train, X_test, y_train, y_test, max_leaf_nodes)
        list_of_maes.append(mae)
    ret = sum(list_of_maes) / len(list_of_maes)
    
    return ret


def best_max_leaf_nodes(n, data_X, data_y):
    x =[]
    for i in range(2, n):
        x.append(decision_tree_crossval_mae(data_X, data_y, i, 25))
    m = min(x)
    ret = [i for i, j in enumerate(x) if j == m]
    ret = min(ret) + 2
    print("max leaf nodes:", ret)
    return ret


def decision_tree_model(data_X, data_y):
    
    max_leaf_nodes = best_max_leaf_nodes(25, data_X, data_y)
    model = DecisionTreeRegressor(max_leaf_nodes = max_leaf_nodes, 
                                  random_state=0)
    model.fit(data_X, data_y)
    
    return model




# why do all this? It prevents overfitting.

In [398]:
def random_forest_mae(train_X, test_X, train_y, test_y, max_leaf_nodes):
    model = RandomForestRegressor(max_leaf_nodes = max_leaf_nodes, 
                                  random_state=0)
    model.fit(train_X, train_y)
    preds_y = model.predict(test_X)
    mae = mean_absolute_error(test_y, preds_y)
    return mae


def random_forest_crossval_mae(data_X, data_y, max_leaf_nodes, no_of_iterations):
    
    n = no_of_iterations
    
    list_of_maes = []
    
    for i in range(0, n):
        X_train, X_test, y_train, y_test = train_test_split(data_X, data_y, 
                                                            test_size=0.4, random_state=i)
        mae = random_forest_mae(X_train, X_test, y_train, y_test, max_leaf_nodes)
        list_of_maes.append(mae)
    ret = sum(list_of_maes) / len(list_of_maes)
    
    return ret


def best_max_leaf_nodes_2(n, data_X, data_y):
    x =[]
    for i in range(2, n):
        x.append(random_forest_crossval_mae(data_X, data_y, i, 25))
    m = min(x)
    ret = [i for i, j in enumerate(x) if j == m]
    ret = min(ret) + 2
    print("max leaf nodes:", ret)
    return ret


def random_forest_model(data_X, data_y):
    
    max_leaf_nodes = best_max_leaf_nodes_2(25, data_X, data_y)
    model = RandomForestRegressor(max_leaf_nodes = max_leaf_nodes, 
                                  random_state=0)
    model.fit(data_X, data_y)
    
    return model


In [399]:
model_tester(random_forest_model, X1, y1, X2, y2)

('max leaf nodes:', 11)


2.7428310998180567

In [397]:
model_tester(decision_tree_model, X1, y1, X2, y2)

('max leaf nodes:', 4)


3.9083218766111019

In [401]:
model = random_forest_model(X1, y1)

('max leaf nodes:', 11)


In [410]:
y_preds = model.predict(X2)
y = sum(y2)

TypeError: unsupported operand type(s) for +: 'int' and 'str'

In [403]:
y_preds

array([ 3.95310833,  4.18854167, -0.2531    ,  2.683375  ,  3.77394167,
        2.364275  ,  3.55620833, -0.20930238, -1.15351071,  0.64288214,
       -0.48336786,  1.97691587,  1.24957659,  1.45907659,  2.53491587,
        2.70977183,  3.30296429, -0.20930238,  1.37092262,  0.96563095,
       -3.99220833,  1.41891667, -2.98976407])