In [34]:
import pandas as pd
import numpy as np
from sklearn import metrics

from sklearn.model_selection import train_test_split
from datetime import datetime
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier

import numbers
from sklearn import model_selection


In [35]:
# Original note: in the future the Nasdaq, gold and/or Ethereum data should be added here
# to see the results. Gold and Nasdaq are loaded below; Ethereum is not.
data_nasdaq = pd.read_csv('./nasdaq_data_for_use.csv', index_col=0)
data_gold = pd.read_csv('./gold_data_for_use.csv', index_col=0)
data_for_use = pd.read_csv('./bitcoin_data_for_use_v2.csv', index_col=0)

# Index-on-index merges: Bitcoin + gold, and Bitcoin + Nasdaq
data_for_use_w_gold = pd.merge(data_for_use, data_gold, left_index=True, right_index=True)
data_for_use_w_nasdaq = pd.merge(data_for_use, data_nasdaq, left_index=True, right_index=True)

# Bitcoin + gold + Nasdaq
data_for_use_w_all = pd.merge(data_for_use_w_gold, data_nasdaq, left_index=True, right_index=True)





In [36]:
# Select here the dataset that the rest of the notebook actually uses
data_for_use = data_for_use_w_all

In [37]:
# Split into train and test in row order (shuffle=False) rather than at random,
# which is how the model would be used in reality: the final 20% of rows is the test set.
data_no_test, data_test = train_test_split(data_for_use, test_size=0.2, shuffle=False)

# Split the non-test rows once more, again in order, into train and validation
data_train, data_valid = train_test_split(data_no_test, test_size=0.2, shuffle=False)

In [38]:
# Separate the variable to predict ('subida') from the features in every split
X_train, y_train = data_train.drop(columns=['subida']), data_train['subida']

X_valid, y_valid = data_valid.drop(columns=['subida']), data_valid['subida']

X_test, y_test = data_test.drop(columns=['subida']), data_test['subida']

In [39]:
# Share of days on which Bitcoin goes up, out of all days in the split.
# These ratios give a baseline for judging how good the models' results are.
y_train.sum() / len(y_train)

0.5683345780433159

In [40]:
# Share of "up" days in the validation split
y_valid.sum() / len(y_valid)

0.48656716417910445

In [41]:
# Share of "up" days in the test split
y_test.sum() / len(y_test)

0.5513126491646778

In [42]:
# The 'dia' column is one-hot encoded; the numeric columns go through StandardScaler.
# Every other column is passed through unchanged (remainder='passthrough').

# Take the numeric columns from the features only: the target 'subida' is dropped first,
# because the X_* frames handed to the transformer do not contain it and it must never
# be listed as a column to scale.
# NOTE(review): if 'dia' is itself numeric it is both one-hot encoded and scaled - confirm its dtype.
number_columns = data_for_use.drop(columns=['subida']).select_dtypes('number').columns

t = [
    ('dia', OneHotEncoder(handle_unknown='ignore'), ['dia']),
    ('scaler', StandardScaler(), number_columns),
]

ct = ColumnTransformer(transformers=t, remainder='passthrough')

# ct is deliberately not fitted in this cell: every cell that uses it calls
# ct.fit_transform on its own training data first.

In [43]:
def walk_forward_validation(model, X_train_wfv, y_train_wfv, X_test_wfv, y_test_wfv):
    """Score `model` with an expanding-window (walk-forward) evaluation.

    For every row of the test split, in order, the module-level column
    transformer `ct` and `model` are re-fitted on all rows seen so far (the
    training split plus the test rows that precede the current one), and the
    current test row is predicted. The inputs are not modified.

    Parameters
    ----------
    model : estimator exposing fit(X, y) and predict(X).
    X_train_wfv, X_test_wfv : pandas.DataFrame
        Feature rows of the initial training window and of the evaluation window.
    y_train_wfv, y_test_wfv : pandas.Series
        Targets matching the rows of the corresponding feature frame.

    Returns
    -------
    tuple
        (confusion matrix, F1 score, accuracy) of the one-step-ahead
        predictions against `y_test_wfv`.

    Raises
    ------
    ValueError
        If a feature frame and its target series have different lengths.
    """
    if len(X_train_wfv) != len(y_train_wfv) or len(X_test_wfv) != len(y_test_wfv):
        raise ValueError("features and targets must have the same number of rows")

    n_train = len(X_train_wfv)

    # Concatenate once and slice by position instead of growing the training data row by
    # row: DataFrame.append / Series.append no longer exist in pandas 2.x, and .iloc avoids
    # the label-vs-position ambiguity of y_test_wfv[i].
    X_all = pd.concat([X_train_wfv, X_test_wfv])
    y_all = pd.concat([y_train_wfv, y_test_wfv])

    y_pred_wfv = []
    for i in range(len(y_test_wfv)):
        cutoff = n_train + i  # rows [0, cutoff) are known when predicting row `cutoff`
        X_hist_ct = ct.fit_transform(X_all.iloc[:cutoff])
        model.fit(X_hist_ct, y_all.iloc[:cutoff])
        # Only the row being predicted is transformed, not the whole test split
        X_next_ct = ct.transform(X_all.iloc[cutoff:cutoff + 1])
        y_pred_wfv.append(model.predict(X_next_ct)[0])

    return (
        metrics.confusion_matrix(y_test_wfv, y_pred_wfv),
        metrics.f1_score(y_test_wfv, y_pred_wfv),
        metrics.accuracy_score(y_test_wfv, y_pred_wfv),
    )
        
        

In [13]:
# L2-penalised logistic regression, scored walk-forward on the validation split
lr = LogisticRegression(solver='lbfgs', penalty='l2')

# Hand copies of the splits to the evaluation
X_train_c, y_train_c = X_train.copy(), y_train.copy()
X_valid_c, y_valid_c = X_valid.copy(), y_valid.copy()

conf_mat, f1sc, accsc = walk_forward_validation(lr, X_train_c, y_train_c, X_valid_c, y_valid_c)

In [14]:
# Confusion matrix, F1 score and accuracy from the preceding walk_forward_validation call
print(conf_mat, f1sc, accsc)

[[99 73]
 [88 75]] 0.48231511254019294 0.5194029850746269


In [27]:
# Try k = 1..30 neighbours, scoring each setting walk-forward on the validation split
for vecinos in range(1, 31):
    kn = KNeighborsClassifier(n_neighbors=vecinos)

    # Fresh copies of the splits for every setting
    X_train_c, y_train_c = X_train.copy(), y_train.copy()
    X_valid_c, y_valid_c = X_valid.copy(), y_valid.copy()

    conf_mat, f1sc, accsc = walk_forward_validation(kn, X_train_c, y_train_c, X_valid_c, y_valid_c)
    print(f"Numero de vecinos aplicado: {vecinos}")
    print(conf_mat, f1sc, accsc)

Numero de vecinos aplicado: 1
[[98 74]
 [86 77]] 0.49044585987261147 0.5223880597014925
Numero de vecinos aplicado: 2
[[134  38]
 [127  36]] 0.30379746835443044 0.5074626865671642
Numero de vecinos aplicado: 3
[[81 91]
 [80 83]] 0.4925816023738872 0.48955223880597015
Numero de vecinos aplicado: 4
[[111  61]
 [112  51]] 0.37090909090909097 0.4835820895522388
Numero de vecinos aplicado: 5
[[ 70 102]
 [ 71  92]] 0.5154061624649859 0.4835820895522388
Numero de vecinos aplicado: 6
[[94 78]
 [97 66]] 0.4299674267100977 0.47761194029850745
Numero de vecinos aplicado: 7
[[ 64 108]
 [ 76  87]] 0.48603351955307267 0.4507462686567164
Numero de vecinos aplicado: 8
[[80 92]
 [89 74]] 0.4498480243161094 0.4597014925373134
Numero de vecinos aplicado: 9
[[ 59 113]
 [ 63 100]] 0.5319148936170213 0.4746268656716418
Numero de vecinos aplicado: 10
[[78 94]
 [85 78]] 0.46567164179104475 0.46567164179104475
Numero de vecinos aplicado: 11
[[ 55 117]
 [ 63 100]] 0.5263157894736842 0.4626865671641791
Numero de

In [28]:
# Try polynomial-kernel SVCs of degree 1..3, scoring each walk-forward on the validation split
for grado in range(1,4):
    # probability=True is not requested: only predict() is used downstream, and enabling
    # probability estimates makes every one of the many re-fits run an extra internal
    # cross-validation.
    sv = SVC(kernel='poly', degree=grado)

    X_train_c=X_train.copy()
    y_train_c=y_train.copy()
    X_valid_c=X_valid.copy()
    y_valid_c=y_valid.copy()


    conf_mat, f1sc, accsc = walk_forward_validation(sv, X_train_c, y_train_c, X_valid_c, y_valid_c)
    print ("Grado aplicado: " + str(grado))
    print(conf_mat, f1sc, accsc)

Grado aplicado: 1
[[ 56 116]
 [ 55 108]] 0.5581395348837208 0.48955223880597015
Grado aplicado: 2
[[ 33 139]
 [ 40 123]] 0.5788235294117647 0.46567164179104475
Grado aplicado: 3
[[ 21 151]
 [ 18 145]] 0.6318082788671023 0.4955223880597015


In [29]:
# Try decision trees with max_depth 1..30, scoring each walk-forward on the validation split
for profund in range(1,31):
    # Fixed random_state so the sweep gives the same numbers on every run
    dt = DecisionTreeClassifier(max_depth=profund, random_state=42)

    X_train_c=X_train.copy()
    y_train_c=y_train.copy()
    X_valid_c=X_valid.copy()
    y_valid_c=y_valid.copy()


    conf_mat, f1sc, accsc = walk_forward_validation(dt, X_train_c, y_train_c, X_valid_c, y_valid_c)
    print ("Maxima profundidad aplicada: " + str(profund))
    print(conf_mat, f1sc, accsc)

Maxima profundidad aplicada: 1
[[  4 168]
 [  6 157]] 0.6434426229508197 0.48059701492537316
Maxima profundidad aplicada: 2
[[  4 168]
 [  9 154]] 0.6350515463917525 0.4716417910447761
Maxima profundidad aplicada: 3
[[ 13 159]
 [ 18 145]] 0.6209850107066381 0.4716417910447761
Maxima profundidad aplicada: 4
[[ 25 147]
 [ 27 136]] 0.6098654708520179 0.48059701492537316
Maxima profundidad aplicada: 5
[[ 49 123]
 [ 45 118]] 0.5841584158415841 0.49850746268656715
Maxima profundidad aplicada: 6
[[ 40 132]
 [ 43 120]] 0.5783132530120482 0.47761194029850745
Maxima profundidad aplicada: 7
[[ 52 120]
 [ 61 102]] 0.5298701298701299 0.4597014925373134
Maxima profundidad aplicada: 8
[[ 65 107]
 [ 71  92]] 0.5082872928176795 0.46865671641791046
Maxima profundidad aplicada: 9
[[ 55 117]
 [ 71  92]] 0.4946236559139785 0.4388059701492537
Maxima profundidad aplicada: 10
[[ 59 113]
 [ 70  93]] 0.5040650406504066 0.4537313432835821
Maxima profundidad aplicada: 11
[[ 72 100]
 [ 77  86]] 0.49283667621776506

In [45]:
# Try AdaBoost with 1..20 estimators, scoring each walk-forward on the validation split
for estimadores in range(1,21):
    # Fixed random_state so the sweep gives the same numbers on every run
    adab = AdaBoostClassifier(n_estimators=estimadores, random_state=42)

    X_train_c=X_train.copy()
    y_train_c=y_train.copy()
    X_valid_c=X_valid.copy()
    y_valid_c=y_valid.copy()


    conf_mat, f1sc, accsc = walk_forward_validation(adab, X_train_c, y_train_c, X_valid_c, y_valid_c)
    print ("Estimadores aplicados: " + str(estimadores))
    print(conf_mat, f1sc, accsc)

Estimadores aplicados: 1
[[  4 168]
 [  6 157]] 0.6434426229508197 0.48059701492537316
Estimadores aplicados: 2
[[  7 165]
 [ 10 153]] 0.6361746361746361 0.47761194029850745
Estimadores aplicados: 3
[[ 16 156]
 [ 16 147]] 0.630901287553648 0.48656716417910445
Estimadores aplicados: 4
[[ 16 156]
 [ 15 148]] 0.6338329764453962 0.48955223880597015
Estimadores aplicados: 5
[[ 26 146]
 [ 28 135]] 0.6081081081081081 0.48059701492537316
Estimadores aplicados: 6
[[ 43 129]
 [ 41 122]] 0.5893719806763285 0.4925373134328358
Estimadores aplicados: 7
[[ 46 126]
 [ 45 118]] 0.5798525798525799 0.48955223880597015
Estimadores aplicados: 8
[[ 53 119]
 [ 57 106]] 0.5463917525773195 0.4746268656716418
Estimadores aplicados: 9
[[ 53 119]
 [ 52 111]] 0.5648854961832062 0.48955223880597015
Estimadores aplicados: 10
[[ 59 113]
 [ 56 107]] 0.5587467362924282 0.4955223880597015
Estimadores aplicados: 11
[[ 51 121]
 [ 54 109]] 0.55470737913486 0.47761194029850745
Estimadores aplicados: 12
[[ 56 116]
 [ 53 110]

In [31]:
# In-sample check: fit a depth-13 tree on train+validation and score it on that same data
dt = DecisionTreeClassifier(max_depth=13, random_state=42)  # seeded for reproducible output

# pd.concat replaces DataFrame.append / Series.append, which were removed in pandas 2.0
X_train_c = pd.concat([X_train, X_valid])
y_train_c = pd.concat([y_train, y_valid])

X_train_c=ct.fit_transform(X_train_c)
dt.fit(X_train_c,y_train_c)
y_predict=dt.predict(X_train_c)

(metrics.confusion_matrix(y_train_c, y_predict), metrics.f1_score(y_train_c,y_predict), metrics.accuracy_score(y_train_c,y_predict))

(array([[609, 141],
        [ 69, 855]], dtype=int64),
 0.890625,
 0.8745519713261649)

In [32]:
# In-sample check: fit a depth-18 tree on train+validation and score it on that same data
dt = DecisionTreeClassifier(max_depth=18, random_state=42)  # seeded for reproducible output

# pd.concat replaces DataFrame.append / Series.append, which were removed in pandas 2.0
X_train_c = pd.concat([X_train, X_valid])
y_train_c = pd.concat([y_train, y_valid])

X_train_c=ct.fit_transform(X_train_c)
dt.fit(X_train_c,y_train_c)
y_predict=dt.predict(X_train_c)

(metrics.confusion_matrix(y_train_c, y_predict), metrics.f1_score(y_train_c,y_predict), metrics.accuracy_score(y_train_c,y_predict))

(array([[743,   7],
        [ 11, 913]], dtype=int64),
 0.990238611713666,
 0.989247311827957)

In [29]:
# In-sample check: fit 1-nearest-neighbour on train+validation and score it on that same data
kn = KNeighborsClassifier(n_neighbors=1)

# pd.concat replaces DataFrame.append / Series.append, which were removed in pandas 2.0
X_train_c = pd.concat([X_train, X_valid])
y_train_c = pd.concat([y_train, y_valid])

X_train_c=ct.fit_transform(X_train_c)
kn.fit(X_train_c,y_train_c)
y_predict=kn.predict(X_train_c)

(metrics.confusion_matrix(y_train_c, y_predict), metrics.f1_score(y_train_c,y_predict), metrics.accuracy_score(y_train_c,y_predict))

(array([[750,   0],
        [  0, 924]]),
 1.0,
 1.0)

In [30]:
# In-sample check: fit the logistic regression on train+validation and score it on that same data
lr = LogisticRegression(penalty='l2', solver='lbfgs')

# pd.concat replaces DataFrame.append / Series.append, which were removed in pandas 2.0
X_train_c = pd.concat([X_train, X_valid])
y_train_c = pd.concat([y_train, y_valid])

X_train_c=ct.fit_transform(X_train_c)
lr.fit(X_train_c,y_train_c)
y_predict=lr.predict(X_train_c)

(metrics.confusion_matrix(y_train_c, y_predict), metrics.f1_score(y_train_c,y_predict), metrics.accuracy_score(y_train_c,y_predict))

(array([[367, 383],
        [223, 701]]),
 0.6982071713147411,
 0.6379928315412187)

In [44]:
# This is the code to see how the model really performs:
# train+validation form the initial window and the test split is scored walk-forward.

kn = KNeighborsClassifier(n_neighbors=1)

# pd.concat replaces DataFrame.append / Series.append, which were removed in pandas 2.0
X_train_c = pd.concat([X_train, X_valid])
y_train_c = pd.concat([y_train, y_valid])
X_test_c=X_test.copy()
y_test_c=y_test.copy()

conf_mat, f1sc, accsc = walk_forward_validation(kn, X_train_c, y_train_c, X_test_c, y_test_c)

print(conf_mat, f1sc, accsc)

[[ 77 111]
 [ 95 136]] 0.5690376569037658 0.5083532219570406


In [46]:
# This is the code to see how the model really performs:
# train+validation form the initial window and the test split is scored walk-forward.

adab = AdaBoostClassifier(n_estimators=16, random_state=42)  # seeded for reproducible output

# pd.concat replaces DataFrame.append / Series.append, which were removed in pandas 2.0
X_train_c = pd.concat([X_train, X_valid])
y_train_c = pd.concat([y_train, y_valid])
X_test_c=X_test.copy()
y_test_c=y_test.copy()

conf_mat, f1sc, accsc = walk_forward_validation(adab, X_train_c, y_train_c, X_test_c, y_test_c)

print(conf_mat, f1sc, accsc)

[[ 58 130]
 [ 74 157]] 0.6061776061776062 0.513126491646778
