In [22]:
import pandas as pd
import numpy as np
from sklearn import metrics

from sklearn.model_selection import train_test_split
from datetime import datetime
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier

import numbers
from sklearn import model_selection


In [23]:
# Going forward, the Nasdaq, gold and/or Ethereum data should be added here
# to see how the results change.
data_for_use = pd.read_csv('./bitcoin_data_for_use_v2.csv', index_col=0)
data_gold = pd.read_csv('./gold_data_for_use.csv', index_col=0)
data_nasdaq = pd.read_csv('./nasdaq_data_for_use.csv', index_col=0)

# Candidate feature sets, each built by matching rows on the index
# (the first CSV column) with pandas' default merge behaviour.
data_for_use_w_gold = pd.merge(data_for_use, data_gold, left_index=True, right_index=True)
data_for_use_w_nasdaq = pd.merge(data_for_use, data_nasdaq, left_index=True, right_index=True)

# Bitcoin + gold + Nasdaq together.
data_for_use_w_all = pd.merge(data_for_use_w_gold, data_nasdaq, left_index=True, right_index=True)





In [24]:
# Select the dataset that the rest of the notebook actually uses.
# NOTE: this rebinds `data_for_use` (previously the raw bitcoin frame) to the
# bitcoin + Nasdaq merge; re-run the loading cell first to get the raw frame back.
data_for_use = data_for_use_w_nasdaq

In [25]:
# Hold out the last 20% of rows as the test set. shuffle=False keeps the row
# order instead of sampling at random, which is how the model would be used in
# practice (assumes rows are sorted by date -- TODO confirm).
data_no_test, data_test = train_test_split(
    data_for_use,
    test_size=0.2,
    shuffle=False,
)

# Split the remaining rows again, the same way, into train and validation.
data_train, data_valid = train_test_split(
    data_no_test,
    test_size=0.2,
    shuffle=False,
)

In [26]:
# Separate the variable we want to predict ('subida') from the features,
# for each of the three partitions.
X_train, y_train = data_train.drop(columns=['subida']), data_train['subida']

X_valid, y_valid = data_valid.drop(columns=['subida']), data_valid['subida']

X_test, y_test = data_test.drop(columns=['subida']), data_test['subida']

In [27]:
# Check on how many days bitcoin goes up, out of the total, in each split.
# This gives a baseline for judging how good the models' results are.
# (sum / size is the share of positive rows, assuming 'subida' is boolean or 0/1 -- TODO confirm.)
y_train.sum()/y_train.size

0.5683345780433159

In [28]:
# Same ratio (sum of 'subida' over number of rows) for the validation split.
y_valid.sum()/y_valid.size

0.48656716417910445

In [29]:
# Same ratio (sum of 'subida' over number of rows) for the test split.
y_test.sum()/y_test.size

0.5513126491646778

In [30]:
# Preprocessing: one-hot encode the 'dia' column and standardise the numeric
# columns; any other column is passed through unchanged.
#
# The numeric column list is taken from the features only: the target 'subida'
# is dropped first so it can never end up in the scaler's column list (the
# transformer is applied to X_* frames, which do not contain 'subida').
# NOTE(review): if 'dia' has a numeric dtype it is also part of number_columns
# and would be both encoded and scaled -- confirm this is intended.
number_columns = data_for_use.drop(columns=['subida']).select_dtypes('number').columns

t = [
    ('dia', OneHotEncoder(handle_unknown='ignore'), ['dia']),
    ('scaler', StandardScaler(), number_columns),
]

# `ct` is intentionally left unfitted here: walk_forward_validation re-fits it
# on the training rows available at each step.
ct = ColumnTransformer(transformers=t, remainder='passthrough')

In [31]:
def walk_forward_validation(model, X_train_wfv, y_train_wfv, X_test_wfv, y_test_wfv, transformer=None):
    """Score ``model`` with one-step-ahead walk-forward validation.

    For every row of ``X_test_wfv``, in order:
      1. the transformer is re-fitted on the training rows seen so far,
      2. ``model`` is re-fitted on the transformed training rows,
      3. the next test row is transformed and predicted,
      4. that row and its true label are added to the training data used by
         the following step.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``.
    X_train_wfv : pandas.DataFrame
        Initial training features.
    y_train_wfv : pandas.Series
        Initial training labels.
    X_test_wfv : pandas.DataFrame
        Features to predict, one row per step, in evaluation order.
    y_test_wfv : pandas.Series (or other sequence accepted by ``pd.Series``)
        True labels for ``X_test_wfv``.
    transformer : optional
        Object exposing ``fit_transform`` / ``transform``. When omitted, the
        notebook-level ``ct`` ColumnTransformer is used.

    Returns
    -------
    tuple
        ``(confusion_matrix, f1_score, accuracy_score)`` from
        ``sklearn.metrics``, comparing ``y_test_wfv`` with the predictions.

    Notes
    -----
    The caller's frames are not modified: the growing training set is rebuilt
    with ``pd.concat`` (``DataFrame.append`` / ``Series.append`` no longer
    exist in pandas 2.x), which returns new objects.
    """
    if transformer is None:
        # Fall back to the notebook-level transformer.
        transformer = ct

    # Positional access below must not depend on the index labels, so wrap the
    # labels in a Series and use .iloc (an integer key on a label-indexed
    # Series is a label lookup / deprecated positional fallback).
    y_test_steps = pd.Series(y_test_wfv)

    y_pred_wfv = []
    for i in range(len(y_test_wfv)):
        # Preprocessing is fitted on past rows only, then the model on top of it.
        X_train_wfv_ct = transformer.fit_transform(X_train_wfv)
        model.fit(X_train_wfv_ct, y_train_wfv)

        # Only the row being predicted needs transforming at this step.
        next_row = X_test_wfv.iloc[i:i + 1]
        y_pred_next = model.predict(transformer.transform(next_row))
        y_pred_wfv.append(y_pred_next[0])

        # Reveal the true outcome of this step to the next iteration.
        X_train_wfv = pd.concat([X_train_wfv, next_row])
        y_train_wfv = pd.concat([y_train_wfv, y_test_steps.iloc[i:i + 1]])

    return (
        metrics.confusion_matrix(y_test_wfv, y_pred_wfv),
        metrics.f1_score(y_test_wfv, y_pred_wfv),
        metrics.accuracy_score(y_test_wfv, y_pred_wfv),
    )
        
        

In [11]:
# Logistic regression with L2 penalty, scored by walk-forward validation
# over the validation split.
lr = LogisticRegression(solver='lbfgs', penalty='l2')

# The evaluation is run on copies of the train / validation data.
X_train_c, y_train_c = X_train.copy(), y_train.copy()
X_valid_c, y_valid_c = X_valid.copy(), y_valid.copy()

conf_mat, f1sc, accsc = walk_forward_validation(lr, X_train_c, y_train_c, X_valid_c, y_valid_c)

In [12]:
# Show the confusion matrix, F1 score and accuracy returned by the previous run.
print(conf_mat, f1sc, accsc)

[[ 55 117]
 [ 44 119]] 0.5964912280701754 0.5194029850746269


In [13]:
# Sweep the number of neighbours (1..30) for k-NN, scoring each setting with
# walk-forward validation over the validation split.
for vecinos in range(1, 31):
    kn = KNeighborsClassifier(n_neighbors=vecinos)

    # Fresh copies of the data for every setting.
    X_train_c, y_train_c = X_train.copy(), y_train.copy()
    X_valid_c, y_valid_c = X_valid.copy(), y_valid.copy()

    conf_mat, f1sc, accsc = walk_forward_validation(kn, X_train_c, y_train_c, X_valid_c, y_valid_c)
    print(f"Numero de vecinos aplicado: {vecinos}")
    print(conf_mat, f1sc, accsc)

Numero de vecinos aplicado: 1
[[80 92]
 [83 80]] 0.47761194029850745 0.47761194029850745
Numero de vecinos aplicado: 2
[[132  40]
 [117  46]] 0.36947791164658633 0.5313432835820896
Numero de vecinos aplicado: 3
[[73 99]
 [68 95]] 0.5322128851540616 0.5014925373134328
Numero de vecinos aplicado: 4
[[102  70]
 [ 99  64]] 0.430976430976431 0.4955223880597015
Numero de vecinos aplicado: 5
[[ 67 105]
 [ 71  92]] 0.5111111111111111 0.4746268656716418
Numero de vecinos aplicado: 6
[[86 86]
 [86 77]] 0.4723926380368098 0.48656716417910445
Numero de vecinos aplicado: 7
[[ 56 116]
 [ 66  97]] 0.5159574468085106 0.45671641791044776
Numero de vecinos aplicado: 8
[[76 96]
 [81 82]] 0.4809384164222874 0.4716417910447761
Numero de vecinos aplicado: 9
[[ 49 123]
 [ 55 108]] 0.5482233502538072 0.46865671641791046
Numero de vecinos aplicado: 10
[[ 71 101]
 [ 70  93]] 0.5210084033613445 0.48955223880597015
Numero de vecinos aplicado: 11
[[ 46 126]
 [ 45 118]] 0.5798525798525799 0.48955223880597015
Numero

In [15]:
# Sweep the polynomial degree (1..3) of an SVC with a 'poly' kernel, scoring
# each setting with walk-forward validation over the validation split.
for grado in range(1, 4):
    sv = SVC(kernel='poly', degree=grado, probability=True)

    # Fresh copies of the data for every setting.
    X_train_c, y_train_c = X_train.copy(), y_train.copy()
    X_valid_c, y_valid_c = X_valid.copy(), y_valid.copy()

    conf_mat, f1sc, accsc = walk_forward_validation(sv, X_train_c, y_train_c, X_valid_c, y_valid_c)
    print(f"Grado aplicado: {grado}")
    print(conf_mat, f1sc, accsc)

Grado aplicado: 1
[[ 43 129]
 [ 33 130]] 0.6161137440758293 0.5164179104477612
Grado aplicado: 2
[[ 27 145]
 [ 41 122]] 0.5674418604651164 0.44477611940298506
Grado aplicado: 3
[[ 15 157]
 [ 22 141]] 0.6117136659436009 0.46567164179104475


In [16]:
# Sweep the maximum depth (1..30) of a decision tree, scoring each setting
# with walk-forward validation over the validation split.
for profund in range(1, 31):
    dt = DecisionTreeClassifier(max_depth=profund)

    # Fresh copies of the data for every setting.
    X_train_c, y_train_c = X_train.copy(), y_train.copy()
    X_valid_c, y_valid_c = X_valid.copy(), y_valid.copy()

    conf_mat, f1sc, accsc = walk_forward_validation(dt, X_train_c, y_train_c, X_valid_c, y_valid_c)
    print(f"Maxima profundidad aplicada: {profund}")
    print(conf_mat, f1sc, accsc)

Maxima profundidad aplicada: 1
[[  4 168]
 [  6 157]] 0.6434426229508197 0.48059701492537316
Maxima profundidad aplicada: 2
[[  4 168]
 [  9 154]] 0.6350515463917525 0.4716417910447761
Maxima profundidad aplicada: 3
[[ 16 156]
 [ 15 148]] 0.6338329764453962 0.48955223880597015
Maxima profundidad aplicada: 4
[[ 33 139]
 [ 33 130]] 0.601851851851852 0.48656716417910445
Maxima profundidad aplicada: 5
[[ 32 140]
 [ 40 123]] 0.5774647887323944 0.4626865671641791
Maxima profundidad aplicada: 6
[[ 60 112]
 [ 52 111]] 0.5751295336787564 0.5104477611940299
Maxima profundidad aplicada: 7
[[ 43 129]
 [ 51 112]] 0.5544554455445544 0.4626865671641791
Maxima profundidad aplicada: 8
[[ 71 101]
 [ 65  98]] 0.5414364640883977 0.5044776119402985
Maxima profundidad aplicada: 9
[[ 70 102]
 [ 64  99]] 0.543956043956044 0.5044776119402985
Maxima profundidad aplicada: 10
[[ 76  96]
 [ 61 102]] 0.5650969529085873 0.5313432835820896
Maxima profundidad aplicada: 11
[[ 71 101]
 [ 72  91]] 0.512676056338028 0.483

In [17]:
# Sweep the number of estimators (1..20) of AdaBoost, scoring each setting
# with walk-forward validation over the validation split.
for estimadores in range(1, 21):
    adab = AdaBoostClassifier(n_estimators=estimadores)

    # Fresh copies of the data for every setting.
    X_train_c, y_train_c = X_train.copy(), y_train.copy()
    X_valid_c, y_valid_c = X_valid.copy(), y_valid.copy()

    conf_mat, f1sc, accsc = walk_forward_validation(adab, X_train_c, y_train_c, X_valid_c, y_valid_c)
    print(f"Estimadores aplicados: {estimadores}")
    print(conf_mat, f1sc, accsc)

Estimadores aplicados: 1
[[  4 168]
 [  6 157]] 0.6434426229508197 0.48059701492537316
Estimadores aplicados: 2
[[  7 165]
 [ 10 153]] 0.6361746361746361 0.47761194029850745
Estimadores aplicados: 3
[[ 15 157]
 [ 15 148]] 0.6324786324786325 0.48656716417910445
Estimadores aplicados: 4
[[ 17 155]
 [ 18 145]] 0.6263498920086392 0.4835820895522388
Estimadores aplicados: 5
[[ 31 141]
 [ 30 133]] 0.608695652173913 0.48955223880597015
Estimadores aplicados: 6
[[ 47 125]
 [ 42 121]] 0.5916870415647922 0.5014925373134328
Estimadores aplicados: 7
[[ 50 122]
 [ 48 115]] 0.5750000000000001 0.4925373134328358
Estimadores aplicados: 8
[[ 51 121]
 [ 54 109]] 0.55470737913486 0.47761194029850745
Estimadores aplicados: 9
[[ 54 118]
 [ 52 111]] 0.5663265306122448 0.4925373134328358
Estimadores aplicados: 10
[[ 59 113]
 [ 57 106]] 0.5549738219895287 0.4925373134328358
Estimadores aplicados: 11
[[ 58 114]
 [ 57 106]] 0.5535248041775458 0.48955223880597015
Estimadores aplicados: 12
[[ 64 108]
 [ 64  99]] 

In [18]:
# In-sample check for the degree-1 polynomial SVC: the model is fitted on
# train + validation and scored on those same rows, so the figures below are
# training-set metrics, not an estimate of out-of-sample performance.
sv = SVC(probability=True, kernel='poly', degree=1)

# pd.concat replaces DataFrame.append / Series.append, which were removed in pandas 2.0.
X_train_c = pd.concat([X_train, X_valid])
y_train_c = pd.concat([y_train, y_valid])

# From here on X_train_c holds the transformed feature matrix, not the DataFrame.
X_train_c = ct.fit_transform(X_train_c)
sv.fit(X_train_c, y_train_c)
y_predict = sv.predict(X_train_c)

(metrics.confusion_matrix(y_train_c, y_predict), metrics.f1_score(y_train_c, y_predict), metrics.accuracy_score(y_train_c, y_predict))

(array([[233, 517],
        [137, 787]]),
 0.7064631956912029,
 0.6093189964157706)

In [14]:
# In-sample check for a depth-10 decision tree: fitted on train + validation
# and scored on those same rows (training-set metrics).
dt = DecisionTreeClassifier(max_depth=10)

# pd.concat replaces DataFrame.append / Series.append, which were removed in pandas 2.0.
X_train_c = pd.concat([X_train, X_valid])
y_train_c = pd.concat([y_train, y_valid])

# From here on X_train_c holds the transformed feature matrix, not the DataFrame.
X_train_c = ct.fit_transform(X_train_c)
dt.fit(X_train_c, y_train_c)
y_predict = dt.predict(X_train_c)

(metrics.confusion_matrix(y_train_c, y_predict), metrics.f1_score(y_train_c, y_predict), metrics.accuracy_score(y_train_c, y_predict))

(array([[490, 260],
        [148, 776]], dtype=int64),
 0.7918367346938775,
 0.7562724014336918)

In [15]:
# In-sample check for a depth-23 decision tree: fitted on train + validation
# and scored on those same rows (training-set metrics).
dt = DecisionTreeClassifier(max_depth=23)

# pd.concat replaces DataFrame.append / Series.append, which were removed in pandas 2.0.
X_train_c = pd.concat([X_train, X_valid])
y_train_c = pd.concat([y_train, y_valid])

# From here on X_train_c holds the transformed feature matrix, not the DataFrame.
X_train_c = ct.fit_transform(X_train_c)
dt.fit(X_train_c, y_train_c)
y_predict = dt.predict(X_train_c)

(metrics.confusion_matrix(y_train_c, y_predict), metrics.f1_score(y_train_c, y_predict), metrics.accuracy_score(y_train_c, y_predict))

(array([[750,   0],
        [  0, 924]], dtype=int64),
 1.0,
 1.0)

In [16]:
# In-sample check for 2-nearest-neighbours: fitted on train + validation and
# scored on those same rows (training-set metrics).
kn = KNeighborsClassifier(n_neighbors=2)

# pd.concat replaces DataFrame.append / Series.append, which were removed in pandas 2.0.
X_train_c = pd.concat([X_train, X_valid])
y_train_c = pd.concat([y_train, y_valid])

# From here on X_train_c holds the transformed feature matrix, not the DataFrame.
X_train_c = ct.fit_transform(X_train_c)
kn.fit(X_train_c, y_train_c)
y_predict = kn.predict(X_train_c)

(metrics.confusion_matrix(y_train_c, y_predict), metrics.f1_score(y_train_c, y_predict), metrics.accuracy_score(y_train_c, y_predict))

(array([[750,   0],
        [359, 565]], dtype=int64),
 0.7588985896574882,
 0.7855436081242533)

In [17]:
# In-sample check for AdaBoost with 18 estimators: fitted on train +
# validation and scored on those same rows (training-set metrics).
adab = AdaBoostClassifier(n_estimators=18)

# pd.concat replaces DataFrame.append / Series.append, which were removed in pandas 2.0.
X_train_c = pd.concat([X_train, X_valid])
y_train_c = pd.concat([y_train, y_valid])

# From here on X_train_c holds the transformed feature matrix, not the DataFrame.
X_train_c = ct.fit_transform(X_train_c)
adab.fit(X_train_c, y_train_c)
y_predict = adab.predict(X_train_c)

(metrics.confusion_matrix(y_train_c, y_predict), metrics.f1_score(y_train_c, y_predict), metrics.accuracy_score(y_train_c, y_predict))

(array([[321, 429],
        [187, 737]], dtype=int64),
 0.7052631578947369,
 0.6320191158900836)

In [32]:
# This is the code that shows how the model really performs: a depth-10
# decision tree starts from train + validation and is scored by walk-forward
# validation over the held-out test split.
dt = DecisionTreeClassifier(max_depth=10)

# pd.concat replaces DataFrame.append / Series.append, which were removed in pandas 2.0.
X_train_c = pd.concat([X_train, X_valid])
y_train_c = pd.concat([y_train, y_valid])
X_test_c = X_test.copy()
y_test_c = y_test.copy()

conf_mat, f1sc, accsc = walk_forward_validation(dt, X_train_c, y_train_c, X_test_c, y_test_c)

print(conf_mat, f1sc, accsc)

[[ 70 118]
 [ 88 143]] 0.5813008130081301 0.5083532219570406
