In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics

from sklearn.model_selection import train_test_split
from datetime import datetime
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the BTC-USD price history from the CSV stored next to the notebook.
btc_hist = pd.read_csv(filepath_or_buffer='./BTC-USD.csv')

In [3]:
# It looks like some data is missing: show every row that has at least one
# empty field.
btc_hist.loc[btc_hist.isna().any(axis='columns')]

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
1568,2020-04-17,,,,,,
1743,2020-10-09,,,,,,
1746,2020-10-12,,,,,,
1747,2020-10-13,,,,,,


In [4]:
# The missing rows are filled in by hand with values taken from coinmarketcap.com.
# Work on a copy so the originally loaded data stays untouched.
btc_hist_m = btc_hist.copy()

# Rows are located by their Date instead of by hard-coded row labels
# (1568, 1743, 1746, 1747): if the CSV is refreshed and rows shift, label-based
# writes would silently overwrite the wrong day.
MANUAL_VALUE_COLUMNS = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
MANUAL_ROWS = {
    '2020-10-13': [11548.72, 11548.98, 11321.22, 11425.90, 11425.90, 24241420251.0],
    '2020-10-12': [11392.64, 11698.47, 11240.69, 11555.36, 11555.36, 26163972642.0],
    '2020-10-09': [10927.91, 11102.67, 10846.85, 11064.46, 11064.46, 22799117613.0],
    '2020-04-17': [7116.55, 7167.18, 7050.33, 7096.18, 7096.18, 32513423567.0],
}

for manual_date, manual_values in MANUAL_ROWS.items():
    is_manual_date = btc_hist_m['Date'] == manual_date
    # Fail loudly instead of patching nothing (or several rows) by accident.
    if is_manual_date.sum() != 1:
        raise ValueError(
            f'Expected exactly one row dated {manual_date}, '
            f'found {is_manual_date.sum()}'
        )
    btc_hist_m.loc[is_manual_date, MANUAL_VALUE_COLUMNS] = manual_values

# Sanity check: this should display an empty frame (no row with missing values).
btc_hist_m[btc_hist_m.isna().any(axis=1)]

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume


In [5]:
# Convert the Date column from text to a proper datetime type.
btc_hist_m['Date'] = pd.to_datetime(btc_hist_m['Date'], format='%Y-%m-%d')

In [6]:
# Remove the columns that are not needed for the analysis.
btc_hist_m = btc_hist_m.drop(columns=['Open', 'High', 'Low', 'Adj Close'])

In [7]:
# Calendar features derived from Date: weekday name ('dia') and month number ('mes').
date_accessor = btc_hist_m['Date'].dt
btc_hist_m['dia'] = date_accessor.day_name()
btc_hist_m['mes'] = date_accessor.month

In [8]:
# Add price-variation columns relative to the previous day, week and month.
#
# Previous day's close. shift() leaves the first row empty, so it is seeded
# with that row's own close (the value the original back-fill copied from
# row 1), which makes the first day's variation exactly 0.
#
# The Series is completed *before* it is stored in the frame. The former
# `btc_hist_m['C_dia_ant'].loc[[0]] = ...` was a chained assignment: it raises
# SettingWithCopyWarning and does not update the frame at all under pandas
# Copy-on-Write.
DAY_LAG = 1
close_1d_ago = btc_hist_m['Close'].shift(DAY_LAG)
close_1d_ago.iloc[:DAY_LAG] = btc_hist_m['Close'].iloc[:DAY_LAG].to_numpy()
btc_hist_m['C_dia_ant'] = close_1d_ago

# Percentage change of the close versus the previous day's close.
btc_hist_m['var_dia_ant'] = 100 * btc_hist_m['Close'] / btc_hist_m['C_dia_ant'] - 100

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [9]:
# Close price one week (7 rows) earlier. shift() leaves the first 7 rows
# empty; they are seeded with their own close (the same values the original
# per-row loop copied from rows 7..13), so their weekly variation is 0.
#
# The Series is completed *before* it is stored in the frame: assigning through
# `btc_hist_m['C_sem_ant'].loc[[i]] = ...` is a chained assignment that raises
# SettingWithCopyWarning and is a no-op under pandas Copy-on-Write. A single
# positional slice assignment also replaces the Python-level loop.
WEEK_LAG = 7
close_7d_ago = btc_hist_m['Close'].shift(WEEK_LAG)
close_7d_ago.iloc[:WEEK_LAG] = btc_hist_m['Close'].iloc[:WEEK_LAG].to_numpy()
btc_hist_m['C_sem_ant'] = close_7d_ago

# Percentage change of the close versus the close one week earlier.
btc_hist_m['var_sem_ant'] = 100 * btc_hist_m['Close'] / btc_hist_m['C_sem_ant'] - 100

In [10]:
# Close price one month (30 rows) earlier. shift() leaves the first 30 rows
# empty; they are seeded with their own close (the same values the original
# per-row loop copied from rows 30..59), so their monthly variation is 0.
#
# The Series is completed *before* it is stored in the frame: assigning through
# `btc_hist_m['C_mes_ant'].loc[[i]] = ...` is a chained assignment that raises
# SettingWithCopyWarning and is a no-op under pandas Copy-on-Write. A single
# positional slice assignment also replaces the Python-level loop.
MONTH_LAG = 30
close_30d_ago = btc_hist_m['Close'].shift(MONTH_LAG)
close_30d_ago.iloc[:MONTH_LAG] = btc_hist_m['Close'].iloc[:MONTH_LAG].to_numpy()
btc_hist_m['C_mes_ant'] = close_30d_ago

# Percentage change of the close versus the close 30 rows earlier.
btc_hist_m['var_mes_ant'] = 100 * btc_hist_m['Close'] / btc_hist_m['C_mes_ant'] - 100

In [11]:
# There seem to be notable differences between weekdays
# (Mondays rise much more than Sundays).
btc_hist_m['var_dia_ant'].groupby(btc_hist_m['dia']).mean()

dia
Friday       0.359302
Monday       0.607804
Saturday     0.443110
Sunday       0.025161
Thursday     0.143307
Tuesday      0.160206
Wednesday    0.399522
Name: var_dia_ant, dtype: float64

In [12]:
# There seem to be significant differences between months,
# but I would not dare claim this is representative of anything.
# Only rows dated before 2021-01-01 are included.
(
    btc_hist_m
    .loc[btc_hist_m['Date'] < '2021-01-01']
    .groupby('mes')['var_mes_ant']
    .mean()
)

mes
1      2.585154
2      1.348607
3     -2.541083
4      9.072330
5     27.567925
6     18.361270
7      4.963626
8     11.638733
9     -1.011568
10     7.408049
11    16.318836
12    20.742637
Name: var_mes_ant, dtype: float64