In [18]:
import pandas as pd
import numpy as np
from sklearn import metrics

from sklearn.model_selection import train_test_split
from datetime import datetime
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier

import numbers
from sklearn import model_selection


In [19]:
# Load the bitcoin dataset, plus the gold and nasdaq datasets in case we want to use them.
data_for_use = pd.read_csv('./bitcoin_data_for_use_v3.csv', index_col=0)
data_gold = pd.read_csv('./gold_data_for_use.csv', index_col=0)
data_nasdaq = pd.read_csv('./nasdaq_data_for_use.csv', index_col=0)

# Index-aligned joins of the bitcoin frame with each auxiliary dataset
# (merge keeps only index values present on both sides by default).
data_for_use_w_gold = pd.merge(data_for_use, data_gold, left_index=True, right_index=True)
data_for_use_w_nasdaq = pd.merge(data_for_use, data_nasdaq, left_index=True, right_index=True)

# Bitcoin + gold + nasdaq together.
data_for_use_w_all = pd.merge(data_for_use_w_gold, data_nasdaq, left_index=True, right_index=True)

# "Basic" variant: without 'dia' and without the varV0 .. varV29 columns.
_varv_columns = ['varV' + str(k) for k in range(30)]
data_for_use_basic = data_for_use.drop(columns=['dia'] + _varv_columns)

# "Bone deep" variant: additionally without varPm93 .. varPm333 (step 30)
# and varPs30 .. varPs86 (step 7).
_varpm_columns = ['varPm' + str(k) for k in range(93, 334, 30)]
_varps_columns = ['varPs' + str(k) for k in range(30, 87, 7)]
data_for_use_bone_deep = data_for_use_basic.drop(columns=_varpm_columns + _varps_columns)


In [20]:
# Display the "basic" dataset (the frame left after dropping 'dia' and the varV* columns).
data_for_use_basic

Unnamed: 0,varP0,varP1,varP2,varP3,varP4,varP5,varP6,varP7,varP8,varP9,...,varPm123,varPm153,varPm183,varPm213,varPm243,varPm273,varPm303,varPm333,outlier,subida
2016-01-01,0.008749,0.009252,-0.014696,0.025348,-0.001287,0.013298,-0.084229,0.001468,0.028445,0.013352,...,-0.183043,0.102536,0.131128,-0.060556,-0.054907,-0.068731,0.146342,-0.152457,False,False
2016-01-02,-0.002063,0.008749,0.009252,-0.014696,0.025348,-0.001287,0.013298,-0.084229,0.001468,0.028445,...,-0.192818,0.102514,0.134863,-0.054992,-0.057860,-0.081400,0.215208,-0.139772,False,False
2016-01-03,-0.007907,-0.002063,0.008749,0.009252,-0.014696,0.025348,-0.001287,0.013298,-0.084229,0.001468,...,-0.184701,0.077969,0.162987,-0.049962,-0.093926,-0.044459,0.202201,-0.173499,False,True
2016-01-04,0.007163,-0.007907,-0.002063,0.008749,0.009252,-0.014696,0.025348,-0.001287,0.013298,-0.084229,...,-0.203476,0.048931,0.208760,-0.021020,-0.100629,-0.075179,0.272441,-0.241372,False,False
2016-01-05,-0.002611,0.007163,-0.007907,-0.002063,0.008749,0.009252,-0.014696,0.025348,-0.001287,0.013298,...,-0.182999,0.047772,0.192408,-0.049361,-0.062588,-0.077178,0.234350,-0.244859,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-09-21,-0.050185,-0.093449,-0.021089,0.021386,-0.010795,-0.008157,0.023015,0.047359,-0.023884,0.019066,...,-0.307969,-0.011426,-0.027993,0.819894,0.296122,0.294667,0.420558,0.264591,False,True
2021-09-22,0.070793,-0.050185,-0.093449,-0.021089,0.021386,-0.010795,-0.008157,0.023015,0.047359,-0.023884,...,-0.274826,-0.054379,-0.048679,0.743330,0.420131,0.265584,0.400979,0.219860,False,True
2021-09-23,0.030306,0.070793,-0.050185,-0.093449,-0.021089,0.021386,-0.010795,-0.008157,0.023015,0.047359,...,-0.319474,-0.031845,-0.026437,0.690406,0.351016,0.242234,0.466289,0.217606,False,False
2021-09-24,-0.045781,0.030306,0.070793,-0.050185,-0.093449,-0.021089,0.021386,-0.010795,-0.008157,0.023015,...,-0.226667,-0.031976,0.058981,0.512089,0.309128,0.316711,0.432640,0.215800,False,False


In [21]:
# This selects the dataset variant that is actually used by the rest of the notebook.
# Note: it rebinds `data_for_use`, which previously held the raw bitcoin frame.
data_for_use = data_for_use_basic

In [22]:
# Split the data into train and test by date order (no shuffling), which is how
# the model would be used in reality: the last 20% of the rows become the test set.
data_no_test, data_test = train_test_split(
    data_for_use,
    test_size=0.2,
    shuffle=False,
)

# Split the non-test part again, the same way, into train and validation.
data_train, data_valid = train_test_split(
    data_no_test,
    test_size=0.2,
    shuffle=False,
)

In [23]:
# Separate the variable we want to predict ('subida') from the features.
# Only the training segment is filtered to rows whose 'outlier' flag is 0/False;
# the validation and test segments keep every row.
data_train = data_train.loc[data_train['outlier'] == 0]

X_train = data_train.drop(columns=['subida', 'outlier'])
y_train = data_train['subida']

X_valid = data_valid.drop(columns=['subida', 'outlier'])
y_valid = data_valid['subida']

X_test = data_test.drop(columns=['subida', 'outlier'])
y_test = data_test['subida']

In [24]:
# Check on how many days bitcoin goes up, relative to the total, in each of the segments;
# this gives us a baseline for judging how good the models' results are.
y_train.sum()/y_train.size

0.5684210526315789

In [25]:
# Share of 'subida' labels that are set in the validation segment (sum divided by row count).
y_valid.sum()/y_valid.size

0.4851190476190476

In [26]:
# Share of 'subida' labels that are set in the test segment (sum divided by row count).
y_test.sum()/y_test.size

0.5513126491646778

In [27]:
# Preprocessing shared by every model: standardise all numeric feature columns.
#
# The 'dia' column is dropped when the "basic" dataset is built, so no one-hot
# encoding is configured here. If a dataset variant that still contains 'dia'
# is selected, add
#     ('dia', OneHotEncoder(handle_unknown='ignore'), ['dia'])
# to the transformer list below.

# Take the column list from the feature matrix rather than from the full frame,
# so the 'subida' / 'outlier' columns can never end up in the scaler's column
# list (the transformer would then fail on inputs that lack those columns).
number_columns = X_train.select_dtypes('number').columns

t=[
    ('scaler', StandardScaler(), number_columns)
    ]

# Columns not named in a transformer are passed through unchanged.
# The transformer is only constructed here; it is fitted inside each evaluation
# so that scaling statistics come from the data the model is trained on.
ct = ColumnTransformer(transformers=t, remainder='passthrough')

In [28]:
def walk_forward_validation(model, X_train_wfv, y_train_wfv, X_test_wfv, y_test_wfv, transformer=None):
    """Score ``model`` with expanding-window walk-forward validation.

    For every row ``i`` of the test data the transformer and the model are
    re-fitted on the training data plus test rows ``0 .. i-1``, and a single
    prediction is made for test row ``i``.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train_wfv : pandas.DataFrame
        Initial training features.
    y_train_wfv : pandas.Series
        Initial training labels.
    X_test_wfv : pandas.DataFrame
        Features of the rows that are predicted one at a time.
    y_test_wfv : pandas.Series or sequence
        True labels of those rows.
    transformer : optional
        Object exposing ``fit_transform`` / ``transform``. Defaults to the
        notebook-level ``ct`` column transformer, as in the original code.

    Returns
    -------
    tuple
        ``(confusion_matrix, f1_score, accuracy_score)`` computed from
        ``y_test_wfv`` against the collected one-step predictions.
    """
    if transformer is None:
        transformer = ct  # notebook-level ColumnTransformer

    # Stack train and test once and slice an expanding window out of it.
    # This replaces the per-iteration DataFrame.append / Series.append calls,
    # which were removed in pandas 2.0 and copied the whole frame each step.
    X_all = pd.concat([X_train_wfv, X_test_wfv])
    y_all = pd.concat([y_train_wfv, pd.Series(y_test_wfv)])
    n_train = len(X_train_wfv)

    y_pred_wfv = []
    for i in range(len(y_test_wfv)):
        window_end = n_train + i  # first row NOT available for training at step i

        # Fit preprocessing and model only on the rows known so far.
        X_fit = transformer.fit_transform(X_all.iloc[:window_end])
        model.fit(X_fit, y_all.iloc[:window_end])

        # Transform and predict just the next row (explicitly positional slicing).
        X_next = transformer.transform(X_all.iloc[window_end:window_end + 1])
        y_pred_wfv.append(model.predict(X_next)[0])

    return (
        metrics.confusion_matrix(y_test_wfv, y_pred_wfv),
        metrics.f1_score(y_test_wfv, y_pred_wfv),
        metrics.accuracy_score(y_test_wfv, y_pred_wfv),
    )
        
        

In [29]:
# Logistic regression, scored by walk-forward validation over the validation segment.
lr = LogisticRegression(solver='lbfgs', penalty='l2')

# Hand copies to the helper so the split frames themselves are not the objects it works on.
X_train_c, y_train_c = X_train.copy(), y_train.copy()
X_valid_c, y_valid_c = X_valid.copy(), y_valid.copy()

conf_mat, f1sc, accsc = walk_forward_validation(lr, X_train_c, y_train_c, X_valid_c, y_valid_c)

In [30]:
# Show the confusion matrix, F1 score and accuracy returned by the previous walk-forward run.
print(conf_mat, f1sc, accsc)

[[ 32 141]
 [ 26 137]] 0.6213151927437642 0.5029761904761905


In [31]:
# Sweep the number of neighbours (1..30) of a k-NN classifier and report the
# walk-forward validation scores for each setting.
for vecinos in range(1, 31):
    # Fresh copies of the splits for every run.
    X_train_c, y_train_c = X_train.copy(), y_train.copy()
    X_valid_c, y_valid_c = X_valid.copy(), y_valid.copy()

    kn = KNeighborsClassifier(n_neighbors=vecinos)
    conf_mat, f1sc, accsc = walk_forward_validation(kn, X_train_c, y_train_c, X_valid_c, y_valid_c)

    print ("Numero de vecinos aplicado: " + str(vecinos))
    print(conf_mat, f1sc, accsc)

Numero de vecinos aplicado: 1
[[ 68 105]
 [ 60 103]] 0.5552560646900269 0.5089285714285714
Numero de vecinos aplicado: 2
[[105  68]
 [ 99  64]] 0.4338983050847458 0.5029761904761905
Numero de vecinos aplicado: 3
[[ 55 118]
 [ 53 110]] 0.5626598465473145 0.49107142857142855
Numero de vecinos aplicado: 4
[[88 85]
 [77 86]] 0.5149700598802395 0.5178571428571429
Numero de vecinos aplicado: 5
[[ 44 129]
 [ 40 123]] 0.5927710843373494 0.49702380952380953
Numero de vecinos aplicado: 6
[[ 72 101]
 [ 65  98]] 0.5414364640883977 0.5059523809523809
Numero de vecinos aplicado: 7
[[ 34 139]
 [ 32 131]] 0.605080831408776 0.49107142857142855
Numero de vecinos aplicado: 8
[[ 56 117]
 [ 46 117]] 0.5894206549118388 0.5148809523809523
Numero de vecinos aplicado: 9
[[ 30 143]
 [ 31 132]] 0.6027397260273972 0.48214285714285715
Numero de vecinos aplicado: 10
[[ 45 128]
 [ 41 122]] 0.5907990314769975 0.49702380952380953
Numero de vecinos aplicado: 11
[[ 26 147]
 [ 26 137]] 0.6129753914988815 0.48511904761904

In [32]:
# Sweep the maximum depth (1..30) of a decision tree and report the
# walk-forward validation scores for each setting.
for profund in range(1,31):
    # Fix random_state: scikit-learn permutes features at every split, so
    # without a seed two runs of this cell can report different scores.
    dt = DecisionTreeClassifier(max_depth=profund, random_state=0)

    # Fresh copies of the splits for every run.
    X_train_c=X_train.copy()
    y_train_c=y_train.copy()
    X_valid_c=X_valid.copy()
    y_valid_c=y_valid.copy()

    conf_mat, f1sc, accsc = walk_forward_validation(dt, X_train_c, y_train_c, X_valid_c, y_valid_c)
    print ("Maxima profundidad aplicada: " + str(profund))
    print(conf_mat, f1sc, accsc)

Maxima profundidad aplicada: 1
[[ 13 160]
 [  9 154]] 0.6457023060796646 0.49702380952380953
Maxima profundidad aplicada: 2
[[ 29 144]
 [ 27 136]] 0.6139954853273139 0.49107142857142855
Maxima profundidad aplicada: 3
[[ 26 147]
 [ 27 136]] 0.6098654708520179 0.48214285714285715
Maxima profundidad aplicada: 4
[[ 35 138]
 [ 37 126]] 0.5901639344262295 0.4791666666666667
Maxima profundidad aplicada: 5
[[ 36 137]
 [ 33 130]] 0.6046511627906976 0.49404761904761907
Maxima profundidad aplicada: 6
[[ 39 134]
 [ 40 123]] 0.5857142857142857 0.48214285714285715
Maxima profundidad aplicada: 7
[[ 44 129]
 [ 42 121]] 0.5859564164648909 0.49107142857142855
Maxima profundidad aplicada: 8
[[ 45 128]
 [ 43 120]] 0.583941605839416 0.49107142857142855
Maxima profundidad aplicada: 9
[[ 50 123]
 [ 50 113]] 0.5664160401002506 0.4851190476190476
Maxima profundidad aplicada: 10
[[ 55 118]
 [ 47 116]] 0.584382871536524 0.5089285714285714
Maxima profundidad aplicada: 11
[[ 62 111]
 [ 58 105]] 0.5540897097625329 

In [33]:
# Sweep the number of estimators (1..20) of an AdaBoost classifier and report
# the walk-forward validation scores for each setting.
for estimadores in range(1,21):
    # Fix random_state so that repeated runs of this cell give the same scores.
    adab = AdaBoostClassifier(n_estimators=estimadores, random_state=0)

    # Fresh copies of the splits for every run.
    X_train_c=X_train.copy()
    y_train_c=y_train.copy()
    X_valid_c=X_valid.copy()
    y_valid_c=y_valid.copy()

    conf_mat, f1sc, accsc = walk_forward_validation(adab, X_train_c, y_train_c, X_valid_c, y_valid_c)
    print ("Estimadores aplicados: " + str(estimadores))
    print(conf_mat, f1sc, accsc)

Estimadores aplicados: 1
[[ 13 160]
 [  9 154]] 0.6457023060796646 0.49702380952380953
Estimadores aplicados: 2
[[ 29 144]
 [ 25 138]] 0.6202247191011236 0.49702380952380953
Estimadores aplicados: 3
[[ 45 128]
 [ 42 121]] 0.587378640776699 0.49404761904761907
Estimadores aplicados: 4
[[ 29 144]
 [ 21 142]] 0.6325167037861915 0.5089285714285714
Estimadores aplicados: 5
[[ 34 139]
 [ 25 138]] 0.6272727272727272 0.5119047619047619
Estimadores aplicados: 6
[[ 45 128]
 [ 38 125]] 0.6009615384615384 0.5059523809523809
Estimadores aplicados: 7
[[ 42 131]
 [ 39 124]] 0.5933014354066986 0.49404761904761907
Estimadores aplicados: 8
[[ 52 121]
 [ 45 118]] 0.5870646766169154 0.5059523809523809
Estimadores aplicados: 9
[[ 53 120]
 [ 45 118]] 0.5885286783042394 0.5089285714285714
Estimadores aplicados: 10
[[ 53 120]
 [ 44 119]] 0.5920398009950248 0.5119047619047619
Estimadores aplicados: 11
[[ 51 122]
 [ 42 121]] 0.5960591133004927 0.5119047619047619
Estimadores aplicados: 12
[[ 50 123]
 [ 47 116]] 

In [34]:
# Decision tree (max depth 12) fitted on train + validation and scored on the
# SAME rows: these are in-sample (training) metrics, not a generalisation estimate.
# random_state is fixed so the fitted tree is reproducible between runs.
dt = DecisionTreeClassifier(max_depth=12, random_state=0)

# pd.concat replaces DataFrame.append / Series.append, which were removed in pandas 2.0.
X_train_c=pd.concat([X_train, X_valid])
y_train_c=pd.concat([y_train, y_valid])

# From here on X_train_c holds the transformed feature matrix.
X_train_c=ct.fit_transform(X_train_c)
dt.fit(X_train_c,y_train_c)
y_predict=dt.predict(X_train_c)

(metrics.confusion_matrix(y_train_c, y_predict), metrics.f1_score(y_train_c,y_predict), metrics.accuracy_score(y_train_c,y_predict))

(array([[493, 254],
        [ 72, 847]]),
 0.8386138613861388,
 0.8043217286914766)

In [35]:
# Decision tree (max depth 15) fitted on train + validation and scored on the
# SAME rows: these are in-sample (training) metrics, not a generalisation estimate.
# random_state is fixed so the fitted tree is reproducible between runs.
dt = DecisionTreeClassifier(max_depth=15, random_state=0)

# pd.concat replaces DataFrame.append / Series.append, which were removed in pandas 2.0.
X_train_c=pd.concat([X_train, X_valid])
y_train_c=pd.concat([y_train, y_valid])

# From here on X_train_c holds the transformed feature matrix.
X_train_c=ct.fit_transform(X_train_c)
dt.fit(X_train_c,y_train_c)
y_predict=dt.predict(X_train_c)

(metrics.confusion_matrix(y_train_c, y_predict), metrics.f1_score(y_train_c,y_predict), metrics.accuracy_score(y_train_c,y_predict))

(array([[660,  87],
        [ 70, 849]]),
 0.9153638814016173,
 0.9057623049219687)

In [36]:
# AdaBoost (20 estimators) fitted on train + validation and scored on the
# SAME rows: these are in-sample (training) metrics, not a generalisation estimate.
# random_state is fixed so the fitted ensemble is reproducible between runs.
adab = AdaBoostClassifier(n_estimators=20, random_state=0)

# pd.concat replaces DataFrame.append / Series.append, which were removed in pandas 2.0.
X_train_c=pd.concat([X_train, X_valid])
y_train_c=pd.concat([y_train, y_valid])

# From here on X_train_c holds the transformed feature matrix.
X_train_c=ct.fit_transform(X_train_c)
adab.fit(X_train_c,y_train_c)
y_predict=adab.predict(X_train_c)

(metrics.confusion_matrix(y_train_c, y_predict), metrics.f1_score(y_train_c,y_predict), metrics.accuracy_score(y_train_c,y_predict))

(array([[312, 435],
        [171, 748]]),
 0.7117031398667935,
 0.6362545018007203)

In [37]:
# k-NN (4 neighbours) fitted on train + validation and scored on the SAME rows:
# these are in-sample (training) metrics, not a generalisation estimate.
kn = KNeighborsClassifier(n_neighbors=4)

# pd.concat replaces DataFrame.append / Series.append, which were removed in pandas 2.0.
X_train_c=pd.concat([X_train, X_valid])
y_train_c=pd.concat([y_train, y_valid])

# From here on X_train_c holds the transformed feature matrix.
X_train_c=ct.fit_transform(X_train_c)
kn.fit(X_train_c,y_train_c)
y_predict=kn.predict(X_train_c)

(metrics.confusion_matrix(y_train_c, y_predict), metrics.f1_score(y_train_c,y_predict), metrics.accuracy_score(y_train_c,y_predict))

(array([[583, 164],
        [306, 613]]),
 0.722877358490566,
 0.7178871548619448)

In [38]:
# k-NN (8 neighbours) fitted on train + validation and scored on the SAME rows:
# these are in-sample (training) metrics, not a generalisation estimate.
kn = KNeighborsClassifier(n_neighbors=8)

# pd.concat replaces DataFrame.append / Series.append, which were removed in pandas 2.0.
X_train_c=pd.concat([X_train, X_valid])
y_train_c=pd.concat([y_train, y_valid])

# From here on X_train_c holds the transformed feature matrix.
X_train_c=ct.fit_transform(X_train_c)
kn.fit(X_train_c,y_train_c)
y_predict=kn.predict(X_train_c)

(metrics.confusion_matrix(y_train_c, y_predict), metrics.f1_score(y_train_c,y_predict), metrics.accuracy_score(y_train_c,y_predict))

(array([[401, 346],
        [233, 686]]),
 0.7032291132752434,
 0.6524609843937575)

In [39]:
# This is the code that shows how the model really performs
# (this is the result for the "basic" dataset): the tree is trained on
# train + validation and walked forward over the held-out test segment.

# random_state is fixed so the reported test scores are reproducible between runs.
dt = DecisionTreeClassifier(max_depth=12, random_state=0)

# pd.concat replaces DataFrame.append / Series.append, which were removed in pandas 2.0.
X_train_c=pd.concat([X_train, X_valid])
y_train_c=pd.concat([y_train, y_valid])
X_test_c=X_test.copy()
y_test_c=y_test.copy()

conf_mat, f1sc, accsc = walk_forward_validation(dt, X_train_c, y_train_c, X_test_c, y_test_c)

print(conf_mat, f1sc, accsc)

[[ 72 116]
 [ 75 156]] 0.6202783300198808 0.5441527446300716
