In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics

from sklearn.model_selection import train_test_split
from datetime import datetime
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

import numbers
from sklearn import model_selection

In [12]:
#cargamos los datos diarios de precios de bitcoin, desde 01/01/2016 a 14/09/2021
btc_hist = pd.read_csv('./BTC-USD.csv')

In [13]:
#Parece que tenemos algunos datos que faltan.

btc_hist[btc_hist.isna().any(axis=1)]

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
1568,2020-04-17,,,,,,
1743,2020-10-09,,,,,,
1746,2020-10-12,,,,,,
1747,2020-10-13,,,,,,


In [14]:
#Añadimos a mano los datos, sacándolos de coinmarketcap.com
#creamos una copia con modificaciones para no tocar los datos originales

btc_hist_m=btc_hist.copy()

btc_hist_m.loc[[1747]]=['2020-10-13', 11548.72, 11548.98, 11321.22,
        11425.90, 11425.90, 24241420251.0]
btc_hist_m.loc[[1746]]=['2020-10-12', 11392.64, 11698.47, 11240.69,
        11555.36, 11555.36, 26163972642.0]
btc_hist_m.loc[[1743]]=['2020-10-09', 10927.91, 11102.67, 10846.85,
        11064.46, 11064.46, 22799117613.0]
btc_hist_m.loc[[1568]]=['2020-04-17', 7116.55, 7167.18, 7050.33,
        7096.18, 7096.18, 32513423567.0]

btc_hist_m[btc_hist_m.isna().any(axis=1)]

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume


In [15]:
#Transformamos Date a formato fecha
btc_hist_m['Date']=pd.to_datetime(arg=btc_hist_m['Date'],format='%Y-%m-%d')

In [16]:
#Quitamos las columnas que no necesitamos
btc_hist_m = btc_hist_m.drop(labels=['Open','High','Low','Adj Close'], axis=1)

In [17]:
btc_hist_m['dia']=btc_hist_m['Date'].dt.day_name()
btc_hist_m['mes']=btc_hist_m['Date'].dt.month

In [18]:
#añadimos columnas de variaciones de precio desde el día antes, semana antes y mes antes
btc_hist_m['C_dia_ant'] = btc_hist_m['Close'].shift(1)
btc_hist_m['C_dia_ant'].loc[[0]]=btc_hist_m['C_dia_ant'][1]

btc_hist_m['var_dia_ant']=100*btc_hist_m['Close']/btc_hist_m['C_dia_ant']-100

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [19]:
btc_hist_m['C_sem_ant'] = btc_hist_m['Close'].shift(7)

for i in range(0,7):
    btc_hist_m['C_sem_ant'].loc[[i]]=btc_hist_m['C_sem_ant'][i+7]

btc_hist_m['var_sem_ant']=100*btc_hist_m['Close']/btc_hist_m['C_sem_ant']-100

In [20]:
btc_hist_m['C_mes_ant'] = btc_hist_m['Close'].shift(30)

for i in range(0,30):
    btc_hist_m['C_mes_ant'].loc[[i]]=btc_hist_m['C_mes_ant'][i+30]

btc_hist_m['var_mes_ant']=100*btc_hist_m['Close']/btc_hist_m['C_mes_ant']-100

In [21]:
#Parece que hay diferencias importantes entre los días de la semana (los lunes sube mucho más que los domingos)
btc_hist_m.groupby('dia')['var_dia_ant'].mean()

dia
Friday       0.359302
Monday       0.607804
Saturday     0.443110
Sunday       0.025161
Thursday     0.143307
Tuesday      0.160206
Wednesday    0.399522
Name: var_dia_ant, dtype: float64

In [52]:
#ahora vamos a mirar si hay diferencia significativa en el número de veces que ha subido la cotización por día de la semana
#los "mejores" días cambian respecto a los de las magnitudes de las variaciones, pero sigue habiendo diferencias importantes
#en este caso hay un 7,5% más de posibilidades de que la cotización suba un sábado que de que suba un martes
btc_hist_m.groupby('dia')['var_dia_ant'].apply(lambda x: (x>0).sum()/(x).count())*100

dia
Friday       57.046980
Monday       54.026846
Saturday     59.731544
Sunday       53.355705
Thursday     53.020134
Tuesday      52.348993
Wednesday    57.046980
Name: var_dia_ant, dtype: float64

In [53]:
#Parece que hay diferencias significativas entre los meses,
#pero no me atrevo a decir que esto sea representativo de nada
#porque tenemos muy pocos meses de cada (4 años y pico de datos)
btc_hist_m[btc_hist_m['Date']<'2021-01-01'].groupby('mes')['var_mes_ant'].mean()

mes
1      2.585154
2      1.348607
3     -2.541083
4      9.072330
5     27.567925
6     18.361270
7      4.963626
8     11.638733
9     -1.011568
10     7.408049
11    16.318836
12    20.742637
Name: var_mes_ant, dtype: float64

In [54]:
#añadimos columnas de variaciones de volumen desde el día antes
btc_hist_m['V_dia_ant'] = btc_hist_m['Volume'].shift(1)
btc_hist_m['V_dia_ant'].loc[[0]]=btc_hist_m['V_dia_ant'][1]

btc_hist_m['var_V_dia_ant']=100*btc_hist_m['Volume']/btc_hist_m['V_dia_ant']-100

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [63]:
#ahora creamos el set de datos de verdad, que por ahora va a contener:
#los datos de variación de precios diarios de los últimos 30 días
#los datos de variación de volúmenes diarios de los últimos 30 días
#los datos de variación de precios semanales desde 4 semanas antes a 12 semanas antes
#los datos de variación de precios mensuales desde 3 meses antes a 11 meses antes
#el día de la semana
#si el precio subió (1) o no (0) al día siguiente. Esta es la columna objetivo
#todo esto desde el 1 de enero de 2017, más tarde separaremos los datos de training y de test

columnasData=['varP0', 'varP1', 'varP2', 'varP3', 'varP4', 'varP5', 'varP6', 'varP7', 'varP8', 'varP9',
             'varP10', 'varP11', 'varP12', 'varP13', 'varP14', 'varP15', 'varP16', 'varP17', 'varP18', 'varP19',
             'varP20', 'varP21', 'varP22', 'varP23', 'varP24', 'varP25', 'varP26', 'varP27', 'varP28', 'varP29',
             'varPs30', 'varPs37', 'varPs44', 'varPs51', 'varPs58', 'varPs65', 'varPs72', 'varPs79', 'varPs86',
             'varPm93', 'varPm123', 'varPm153', 'varPm183', 'varPm213', 'varPm243', 'varPm273', 'varPm303',
             'varPm333',
             'varV0', 'varV1', 'varV2', 'varV3', 'varV4', 'varV5', 'varV6', 'varV7', 'varV8', 'varV9',
             'varV10', 'varV11', 'varV12', 'varV13', 'varV14', 'varV15', 'varV16', 'varV17', 'varV18', 'varV19',
             'varV20', 'varV21', 'varV22', 'varV23', 'varV24', 'varV25', 'varV26', 'varV27', 'varV28', 'varV29',
             'dia', 'subida']
data_for_use = pd.DataFrame(columns=columnasData)


for i in range(0,btc_hist_m.shape[0]-1):
    if btc_hist_m['Date'][i]>pd.to_datetime(arg='2016-12-31',format='%Y-%m-%d'):
        dtemp = pd.concat([pd.DataFrame(btc_hist_m['var_dia_ant'][i-29:i+1][::-1].values),
                            pd.DataFrame(btc_hist_m['var_sem_ant'][i-92:i-29][::-7].values),
                            pd.DataFrame(btc_hist_m['var_mes_ant'][i-362:i-92][::-30].values),
                            pd.DataFrame(btc_hist_m['var_V_dia_ant'][i-29:i+1][::-1].values),
                            pd.DataFrame([btc_hist_m['dia'][i]]),
                            pd.DataFrame([btc_hist_m['var_dia_ant'][i+1]>0])],
                          ignore_index=True, axis=0).T

        dtemp = dtemp.set_index(keys=[btc_hist_m['Date'].loc[[i]]])
        
        new_cols = {x: y for x, y in zip(dtemp.columns, data_for_use.columns)}

        dtemp = dtemp.rename(columns=new_cols)

        data_for_use = data_for_use.append(dtemp.rename(columns=new_cols))



Unnamed: 0,varP0,varP1,varP2,varP3,varP4,varP5,varP6,varP7,varP8,varP9,...,varV22,varV23,varV24,varV25,varV26,varV27,varV28,varV29,dia,subida
2017-01-01,3.5883,0.260602,-1.25928,-0.248382,4.57813,2.81927,1.27507,-0.293611,-2.51219,6.64446,...,-21.6318,-14.2383,-16.9189,-17.03,9.26544,75.6385,-12.9256,-45.498,Sunday,True
2017-01-02,2.34643,3.5883,0.260602,-1.25928,-0.248382,4.57813,2.81927,1.27507,-0.293611,-2.51219,...,6.44521,-21.6318,-14.2383,-16.9189,-17.03,9.26544,75.6385,-12.9256,Monday,True
2017-01-03,2.16197,2.34643,3.5883,0.260602,-1.25928,-0.248382,4.57813,2.81927,1.27507,-0.293611,...,33.6005,6.44521,-21.6318,-14.2383,-16.9189,-17.03,9.26544,75.6385,Tuesday,True
2017-01-04,10.6233,2.16197,2.34643,3.5883,0.260602,-1.25928,-0.248382,4.57813,2.81927,1.27507,...,6.62731,33.6005,6.44521,-21.6318,-14.2383,-16.9189,-17.03,9.26544,Wednesday,False
2017-01-05,-12.241,10.6233,2.16197,2.34643,3.5883,0.260602,-1.25928,-0.248382,4.57813,2.81927,...,-6.94048,6.62731,33.6005,6.44521,-21.6318,-14.2383,-16.9189,-17.03,Thursday,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-09-11,0.707485,-3.24955,0.650949,-1.53754,-11.0622,1.70061,3.62158,-0.161418,1.41432,0.984083,...,-6.71278,15.5624,-3.75841,2.05781,5.76953,-0.712548,-1.67874,-5.86936,Saturday,True
2021-09-12,1.9066,0.707485,-3.24955,0.650949,-1.53754,-11.0622,1.70061,3.62158,-0.161418,1.41432,...,16.9371,-6.71278,15.5624,-3.75841,2.05781,5.76953,-0.712548,-1.67874,Sunday,False
2021-09-13,-2.38844,1.9066,0.707485,-3.24955,0.650949,-1.53754,-11.0622,1.70061,3.62158,-0.161418,...,-37.4871,16.9371,-6.71278,15.5624,-3.75841,2.05781,5.76953,-0.712548,Monday,True
2021-09-14,4.73593,-2.38844,1.9066,0.707485,-3.24955,0.650949,-1.53754,-11.0622,1.70061,3.62158,...,35.2138,-37.4871,16.9371,-6.71278,15.5624,-3.75841,2.05781,5.76953,Tuesday,True


In [65]:
data_for_use.to_csv('bitcoin_data_for_use_v1.csv')

## Aquí es donde se pueden empezar futuras ejecuciones con los datos ya mascados

In [2]:
data_for_use = pd.read_csv('./bitcoin_data_for_use_v1.csv', index_col=0)

In [13]:
# Partimos los datos en train y test por fechas, sin que sea aleatorio, que es como funcionaría en la realidad el modelo
data_train, data_test = train_test_split(data_for_use, shuffle=False, test_size=0.2)

In [14]:
data_test.shape

(344, 80)

In [15]:
X_train=data_train.drop(labels=['subida'], axis=1)
y_train=data_train['subida']

X_test=data_test.drop(labels=['subida'], axis=1)
y_test=data_test['subida']

In [16]:
y_train.sum()/y_train.count()

0.5447272727272727

In [17]:
y_test.sum()/y_test.count()

0.5581395348837209

In [18]:
#tenemos que hacer one-hot encoding sobre la columna "dia" y escalar el resto, que son numéricas

number_columns = X_train.select_dtypes('number').columns

t=[('dia', 
    OneHotEncoder(handle_unknown='ignore'),
    ['dia']),
    ('scaler', StandardScaler(),number_columns)
    ]

ct = ColumnTransformer(transformers=t, remainder='passthrough')

X_train=ct.fit_transform(X_train)
X_test=ct.fit_transform(X_test)

In [19]:
#Probamos con LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [20]:
y_pred = lr.predict(X_test)

In [21]:
metrics.confusion_matrix(y_test, y_pred)

array([[ 54,  98],
       [ 79, 113]], dtype=int64)

In [22]:
metrics.f1_score(y_test,y_pred)

0.5607940446650124

In [23]:
metrics.accuracy_score(y_test,y_pred)

0.48546511627906974

In [24]:
#Probamos con DecisionTreeClassifier
dt = DecisionTreeClassifier(max_depth=20)
dt.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=20, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [25]:
y_pred = dt.predict(X_test)

In [26]:
metrics.confusion_matrix(y_test, y_pred)

array([[72, 80],
       [94, 98]], dtype=int64)

In [27]:
metrics.f1_score(y_test,y_pred)

0.5297297297297298

In [28]:
metrics.accuracy_score(y_test,y_pred)

0.4941860465116279

In [44]:
#Probamos con SVC
sv = SVC(probability=True, kernel='poly', degree=1)
sv.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=1, gamma='scale', kernel='poly',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [45]:
y_pred = sv.predict(X_test)

In [46]:
metrics.confusion_matrix(y_test, y_pred)

array([[ 42, 110],
       [ 43, 149]], dtype=int64)

In [47]:
metrics.f1_score(y_test,y_pred)

0.6607538802660754

In [48]:
metrics.accuracy_score(y_test,y_pred)

0.5552325581395349

In [34]:
#Probamos con GaussianNB
gn = GaussianNB()
gn.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [35]:
y_pred = gn.predict(X_test)

In [36]:
metrics.confusion_matrix(y_test, y_pred)

array([[107,  45],
       [148,  44]], dtype=int64)

In [37]:
metrics.f1_score(y_test,y_pred)

0.3131672597864769

In [38]:
metrics.accuracy_score(y_test,y_pred)

0.438953488372093

In [39]:
#probamos con KNeighborsClassifier
kn = KNeighborsClassifier(n_neighbors=10)
kn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                     weights='uniform')

In [40]:
y_pred = kn.predict(X_test)

In [41]:
metrics.confusion_matrix(y_test, y_pred)

array([[ 74,  78],
       [ 89, 103]], dtype=int64)

In [42]:
metrics.f1_score(y_test,y_pred)

0.5522788203753352

In [43]:
metrics.accuracy_score(y_test,y_pred)

0.5145348837209303

## Esto del gridsearch habría que hacerlo para todos los modelos

In [513]:
kn = KNeighborsClassifier()
gs = model_selection.GridSearchCV(kn, param_grid={'n_neighbors':range(1, 50)}, cv=3, scoring='accuracy')
gs.fit(X_train, y_train)

GridSearchCV(cv=3, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'n_neighbors': range(1, 50)}, pre_dispatch='2*n_jobs',
             refit=True, return_train_score=False, scoring='accuracy',
             verbose=0)

In [514]:
gs.best_params_

{'n_neighbors': 49}