In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics

from sklearn.model_selection import train_test_split
from datetime import datetime
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

import numbers
from sklearn import model_selection


In [99]:
# TODO: in the future, add Nasdaq, gold and/or Ethereum data here to see how
# the results change.
# The first column of the CSV is used as the row index.
data_for_use = pd.read_csv(
    './bitcoin_data_for_use_v2.csv',
    index_col=0,
)



In [100]:
# Hold out the test set without shuffling, so rows keep their original order
# and the final 20% of the rows end up in the test set. This mirrors how the
# model would be used in practice (assumes the rows are in date order --
# TODO confirm against the CSV).
data_no_test, data_test = train_test_split(
    data_for_use,
    test_size=0.2,
    shuffle=False,
)

# Split the remaining rows the same way into train and validation.
data_train, data_valid = train_test_split(
    data_no_test,
    test_size=0.2,
    shuffle=False,
)

In [101]:
# Separate the target column ('subida') from the features in every partition.
y_train = data_train['subida']
X_train = data_train.drop(columns=['subida'])

y_valid = data_valid['subida']
X_valid = data_valid.drop(columns=['subida'])

y_test = data_test['subida']
X_test = data_test.drop(columns=['subida'])

In [102]:
# Share of days on which bitcoin goes up, computed for each partition.
# This is the reference point for judging how good the models' results are.
y_train.mean()

0.5686567164179105

In [103]:
# Share of "up" days in the validation partition.
y_valid.mean()

0.4851190476190476

In [104]:
# Share of "up" days in the test partition.
y_test.mean()

0.5513126491646778

In [105]:
# The "dia" column has to be one-hot encoded.
# Scaling the remaining columns does not seem worthwhile: they already come as
# relative differences against the previous day/week/month, so they are passed
# through unchanged.

# Only needed if the optional scaler below is enabled.
number_columns = X_train.select_dtypes('number').columns

t = [
    ('dia', OneHotEncoder(handle_unknown='ignore'), ['dia']),
    # To also scale the numeric columns, add:
    # ('scaler', StandardScaler(), number_columns),
]

ct = ColumnTransformer(transformers=t, remainder='passthrough')

# Encoded version of the whole dataset (the target column is passed through).
data_for_use_t = ct.fit_transform(data_for_use)

# Learn the encoding from the training split only and re-use that fitted
# transformer for validation and test. Re-fitting on every split (as
# fit_transform does) could yield a different column layout whenever a split is
# missing one of the "dia" values, and it lets held-out data influence the
# preprocessing.
X_train = ct.fit_transform(X_train)
X_valid = ct.transform(X_valid)
X_test = ct.transform(X_test)

In [112]:
def walk_forward_validation(model, X_train_wfv, y_train_wfv, X_test_wfv, y_test_wfv):
    """Score ``model`` with an expanding-window walk-forward validation.

    For every test row, in order, the model is re-fitted on all rows seen so
    far (the initial training rows plus the test rows already processed, with
    their true labels) and then asked to predict that single row.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place; on return it holds the fit from the last iteration (all rows
        except the final test row).
    X_train_wfv, X_test_wfv : DataFrame or anything ``pd.DataFrame`` accepts
        Feature rows for the initial training window and for the rows to
        predict. The inputs themselves are not modified.
    y_train_wfv, y_test_wfv : Series or anything ``pd.Series`` accepts
        Labels matching the feature rows position by position.

    Returns
    -------
    tuple
        ``(confusion_matrix, f1_score, accuracy_score)`` computed by
        ``sklearn.metrics`` (default arguments) from the true test labels and
        the one-step-ahead predictions.

    Raises
    ------
    ValueError
        If the training features and training labels differ in length.
    """
    X_initial = pd.DataFrame(X_train_wfv)
    y_initial = pd.Series(y_train_wfv)
    y_true = pd.Series(y_test_wfv)

    n_initial = len(X_initial)
    if n_initial != len(y_initial):
        raise ValueError(
            "X_train_wfv and y_train_wfv must have the same number of rows "
            f"({n_initial} != {len(y_initial)})"
        )

    # Build the full history once and slice it by position. This replaces the
    # per-iteration DataFrame.append/Series.append calls (removed in pandas 2.0)
    # and avoids re-copying the growing history on every step.
    X_all = pd.concat([X_initial, pd.DataFrame(X_test_wfv)])
    y_all = pd.concat([y_initial, y_true])

    y_pred_wfv = []
    for step in range(len(y_true)):
        cutoff = n_initial + step
        # Train on everything strictly before the row being predicted.
        model.fit(X_all.iloc[:cutoff], y_all.iloc[:cutoff])
        y_pred_next = model.predict(X_all.iloc[cutoff:cutoff + 1])
        y_pred_wfv.append(y_pred_next[0])

    return (
        metrics.confusion_matrix(y_true, y_pred_wfv),
        metrics.f1_score(y_true, y_pred_wfv),
        metrics.accuracy_score(y_true, y_pred_wfv),
    )
        
        

In [113]:
# Logistic regression, scored with walk-forward validation on the validation
# partition.
lr = LogisticRegression()

# Wrap the encoded feature matrices in DataFrames and work on copies of the
# label Series.
X_train_c, X_valid_c = pd.DataFrame(X_train), pd.DataFrame(X_valid)
y_train_c, y_valid_c = y_train.copy(), y_valid.copy()

conf_mat, f1sc, accsc = walk_forward_validation(
    lr,
    X_train_c,
    y_train_c,
    X_valid_c,
    y_valid_c,
)

In [117]:
# Confusion matrix, F1 score and accuracy from the latest walk-forward run.
print(f"{conf_mat} {f1sc} {accsc}")

[[ 34 139]
 [ 32 131]] 0.605080831408776 0.49107142857142855


In [118]:
# k-nearest neighbours (k = 10), scored with walk-forward validation on the
# validation partition.
kn = KNeighborsClassifier(n_neighbors=10)

# Wrap the encoded feature matrices in DataFrames and work on copies of the
# label Series.
X_train_c, X_valid_c = pd.DataFrame(X_train), pd.DataFrame(X_valid)
y_train_c, y_valid_c = y_train.copy(), y_valid.copy()

conf_mat, f1sc, accsc = walk_forward_validation(
    kn,
    X_train_c,
    y_train_c,
    X_valid_c,
    y_valid_c,
)

In [119]:
# Confusion matrix, F1 score and accuracy from the latest walk-forward run.
print(f"{conf_mat} {f1sc} {accsc}")

[[79 94]
 [76 87]] 0.5058139534883721 0.49404761904761907
