In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the daily Gold price data, covering 01/01/2015 to 26/09/2021.
# The CSV file uses ';' as its field separator.
gold_hist = pd.read_csv('./Gold.csv', sep=';')

In [3]:
# Check whether any data is missing - some is.
# After some investigation: dates lacking any value (even if it is only the volume) should not be
# in the dataset at all, so the first modification applied to the data will be to remove them.

# Display every row that has a NaN in at least one of its columns.
gold_hist.loc[gold_hist.isna().any(axis='columns')]

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
10,"Sep 10, 2021",1789.6,1789.6,1789.6,1789.6,1789.6,
188,"Dec 24, 2020",,,,,,
207,"Nov 27, 2020",,,,,,
441,"Dec 24, 2019",,,,,,
458,"Nov 29, 2019",,,,,,
562,"Jul 03, 2019",,,,,,
588,"May 28, 2019",1276.5,1276.5,1276.5,1276.5,1276.5,
590,"May 23, 2019",1284.8,1284.8,1284.8,1284.8,1284.8,
591,"May 22, 2019",1273.6,1273.6,1273.6,1273.6,1273.6,
592,"May 21, 2019",1272.0,1272.0,1272.0,1272.0,1272.0,


In [4]:
# Preview the first 10 rows of the data exactly as loaded from the CSV.
gold_hist.head(10)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,"Sep 24, 2021",1749.7,1749.7,1745.0,1749.7,1749.7,31
1,"Sep 23, 2021",1766.8,1766.8,1738.2,1747.7,1747.7,31
2,"Sep 22, 2021",1776.7,1784.3,1767.0,1776.7,1776.7,17
3,"Sep 21, 2021",1762.2,1776.0,1759.0,1776.0,1776.0,106
4,"Sep 20, 2021",1751.8,1761.8,1750.9,1761.8,1761.8,185
5,"Sep 17, 2021",1758.8,1763.0,1749.4,1749.4,1749.4,581
6,"Sep 16, 2021",1792.9,1792.9,1743.9,1754.6,1754.6,1645
7,"Sep 15, 2021",1803.7,1803.7,1792.4,1792.4,1792.4,59
8,"Sep 14, 2021",1791.4,1806.2,1783.0,1804.7,1804.7,764
9,"Sep 13, 2021",1789.8,1796.0,1784.0,1792.0,1792.0,533


In [5]:
# We are about to start modifying the data, so work on a copy and keep gold_hist untouched.
gold_hist_m=gold_hist.copy()

In [6]:
# We are going to remove the rows that contain any NaN, as explained above.
# This cell shows the shape beforehand.
gold_hist_m.shape

(1695, 7)

In [7]:
# Drop every row that has at least one NaN, then show the resulting shape.
gold_hist_m=gold_hist_m.dropna()
gold_hist_m.shape

(1654, 7)

In [8]:
# Convert Date to datetime values and Close to a numeric dtype.
# Close entries that cannot be parsed as numbers become NaN (errors='coerce').
gold_hist_m = gold_hist_m.assign(
    Date=pd.to_datetime(gold_hist_m['Date']),
    Close=pd.to_numeric(gold_hist_m['Close'], errors='coerce'),
)

In [9]:
# Remove the columns that are not needed (everything except the closing prices).
gold_hist_m = gold_hist_m.drop(
    columns=['Open', 'High', 'Low', 'Adj Close', 'Volume'],
)

In [10]:
# The rest of the code needs the rows ordered from oldest to newest, and right now they are in
# exactly the opposite order: flip the rows and renumber the index from 0.
gold_hist_m = gold_hist_m.iloc[::-1].reset_index(drop=True)

In [11]:
# Preview the first 10 rows after cleaning and reordering.
gold_hist_m.head(10)

Unnamed: 0,Date,Close
0,2015-01-02,1186.0
1,2015-01-05,1203.9
2,2015-01-06,1219.3
3,2015-01-07,1210.6
4,2015-01-08,1208.4
5,2015-01-09,1216.0
6,2015-01-12,1232.7
7,2015-01-13,1234.3
8,2015-01-14,1234.4
9,2015-01-15,1264.7


In [12]:
# Add price-variation columns relative to the previous day, week and month.
# Lags are counted in rows: 1, 5 and 20 rows back respectively.
# For every horizon two columns are added:
#   C_*_ant   - the reference close taken `lag` rows earlier
#   var_*_ant - relative change of Close against that reference (Close / reference - 1)
close_prices = gold_hist_m['Close']

for ref_col, var_col, lag in (('C_dia_ant', 'var_dia_ant', 1),
                              ('C_sem_ant', 'var_sem_ant', 5),
                              ('C_mes_ant', 'var_mes_ant', 20)):
    reference_close = close_prices.shift(lag)

    # The first `lag` rows have no earlier row to compare with: they take their own close as
    # reference, so their variation is 0 instead of NaN.
    # The fill is done on the standalone shifted Series, not through gold_hist_m[col].loc[...];
    # that chained assignment triggers SettingWithCopyWarning and does not write through when
    # pandas Copy-on-Write is enabled.
    reference_close.iloc[:lag] = close_prices.iloc[:lag].to_numpy()

    gold_hist_m[ref_col] = reference_close
    gold_hist_m[var_col] = close_prices / reference_close - 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [13]:
# Now we build the actual dataset, which for now contains:
#  - the daily price variations of the last 4 weeks (20 rows; the original comment says
#    "the Nasdaq only trades 5 days/week")
#  - the weekly price variations from 4 weeks before to 12 weeks before
#  - the monthly price variations from 3 months before to 11 months before
# No target data is included here: nothing is predicted for this series, it is only reference
# data that we believe may help the predictions for the cryptocurrencies.
# Rows start on 29 December 2015; training and test data are separated later. That date was chosen
# to be sure there is data before 1 January, so it can be used as the 1 January data when
# crossing with the bitcoin data.
# NOTE(review): the original comments talk about the Nasdaq although this notebook processes the
# Gold series - presumably carried over from a sibling notebook; confirm.

columnasData=['varP0', 'varP1', 'varP2', 'varP3', 'varP4', 'varP5', 'varP6', 'varP7', 'varP8', 'varP9',
             'varP10', 'varP11', 'varP12', 'varP13', 'varP14', 'varP15', 'varP16', 'varP17', 'varP18', 'varP19',
             'varPs20', 'varPs25', 'varPs30', 'varPs35', 'varPs40', 'varPs45', 'varPs50', 'varPs55', 'varPs60',
             'varPm65', 'varPm85', 'varPm105', 'varPm125', 'varPm145', 'varPm165', 'varPm185', 'varPm205',
             'varPm225']

# Only rows dated strictly after this cutoff are emitted (first emitted date: 2015-12-29).
first_row_cutoff = pd.Timestamp('2015-12-28')
# Calendar gaps are only filled for rows dated strictly after this cutoff.
gap_fill_cutoff = pd.Timestamp('2015-12-31')
one_day = pd.Timedelta(days=1)

# Plain positional containers: numpy slicing below has the same semantics as the positional
# Series slicing it replaces.
trading_dates = list(gold_hist_m['Date'])
daily_var = gold_hist_m['var_dia_ant'].to_numpy()
weekly_var = gold_hist_m['var_sem_ant'].to_numpy()
monthly_var = gold_hist_m['var_mes_ant'].to_numpy()

# Rows are collected in lists and the frame is built once at the end: DataFrame.append was removed
# in pandas 2.0 and growing a frame inside a loop is quadratic.
row_dates = []
feature_rows = []
previous_features = None  # features of the last emitted trading row, reused to fill gaps

# NOTE(review): the range stops at shape[0]-1, so the last row of gold_hist_m is never emitted.
# Kept as in the original; confirm whether that is intentional.
for i in range(0, gold_hist_m.shape[0]-1):
    current_date = trading_dates[i]
    if not current_date > first_row_cutoff:
        continue

    # Fill the dates on which Gold did not trade with the data of the previous traded day.
    # The `previous_features is not None` guard means a gap before the very first emitted row
    # is skipped instead of relying on a leftover variable from an earlier run.
    if previous_features is not None and current_date > gap_fill_cutoff:
        gap_date = trading_dates[i-1] + one_day
        while gap_date < current_date:
            row_dates.append(gap_date)
            feature_rows.append(previous_features)
            gap_date = gap_date + one_day

    # 20 daily values (rows i .. i-19), 9 weekly values (rows i-20, i-25, .. i-60) and
    # 9 monthly values (rows i-65, i-85, .. i-225), each ordered from most recent to oldest.
    # Assumes at least 244 rows precede the first emitted date - a negative slice start would
    # silently wrap around; TODO confirm for other input files.
    features = np.concatenate([daily_var[i-19:i+1][::-1],
                               weekly_var[i-64:i-19][::-5],
                               monthly_var[i-244:i-64][::-20]])

    # zip stops at the shorter sequence; any column left without a value ends up as NaN.
    previous_features = dict(zip(columnasData, features))
    row_dates.append(current_date)
    feature_rows.append(previous_features)

data_for_use = pd.DataFrame(feature_rows, index=pd.DatetimeIndex(row_dates), columns=columnasData)

In [14]:
# Preview the first 20 rows of the assembled feature table.
data_for_use.head(20)

Unnamed: 0,varP0,varP1,varP2,varP3,varP4,varP5,varP6,varP7,varP8,varP9,...,varPs60,varPm65,varPm85,varPm105,varPm125,varPm145,varPm165,varPm185,varPm205,varPm225
2015-12-29,0.006539,-0.00622,0.007294,-0.005024,-0.006563,0.014725,0.014656,-0.025232,0.014206,-0.001691,...,-0.007766,0.021026,0.031239,-0.068948,-0.013252,-0.007124,-0.014373,0.037894,-0.054705,0.021102
2015-12-30,-0.016148,0.006539,-0.00622,0.007294,-0.005024,-0.006563,0.014725,0.014656,-0.025232,0.014206,...,0.005389,-0.000971,0.034889,-0.058555,-0.010129,-0.012938,-0.010639,0.037066,-0.065039,0.020395
2015-12-31,0.000189,-0.016148,0.006539,-0.00622,0.007294,-0.005024,-0.006563,0.014725,0.014656,-0.025232,...,0.017478,-0.003977,0.038737,-0.071191,0.004367,-0.012348,-0.009383,0.037282,-0.065687,-0.000892
2016-01-01,0.000189,-0.016148,0.006539,-0.00622,0.007294,-0.005024,-0.006563,0.014725,0.014656,-0.025232,...,0.017478,-0.003977,0.038737,-0.071191,0.004367,-0.012348,-0.009383,0.037282,-0.065687,-0.000892
2016-01-02,0.000189,-0.016148,0.006539,-0.00622,0.007294,-0.005024,-0.006563,0.014725,0.014656,-0.025232,...,0.017478,-0.003977,0.038737,-0.071191,0.004367,-0.012348,-0.009383,0.037282,-0.065687,-0.000892
2016-01-03,0.000189,-0.016148,0.006539,-0.00622,0.007294,-0.005024,-0.006563,0.014725,0.014656,-0.025232,...,0.017478,-0.003977,0.038737,-0.071191,0.004367,-0.012348,-0.009383,0.037282,-0.065687,-0.000892
2016-01-04,0.013958,0.000189,-0.016148,0.006539,-0.00622,0.007294,-0.005024,-0.006563,0.014725,0.014656,...,0.030031,-0.020374,0.044008,-0.05354,-0.017729,-0.013371,-0.012867,0.045569,-0.054881,-0.012396
2016-01-05,0.003069,0.013958,0.000189,-0.016148,0.006539,-0.00622,0.007294,-0.005024,-0.006563,0.014725,...,0.027374,-0.016159,0.043106,-0.066707,-0.011892,-0.004986,-0.013424,0.040517,-0.055323,-0.011585
2016-01-06,0.012519,0.003069,0.013958,0.000189,-0.016148,0.006539,-0.00622,0.007294,-0.005024,-0.006563,...,0.016885,0.011925,0.030728,-0.059362,-0.022848,-0.00545,-0.000168,0.03425,-0.059682,-0.030205
2016-01-07,0.01447,0.012519,0.003069,0.013958,0.000189,-0.016148,0.006539,-0.00622,0.007294,-0.005024,...,0.023548,0.015617,0.024221,-0.054937,-0.018815,-0.031599,0.014066,0.046329,-0.049499,-0.05388


In [15]:
# Write the feature table to 'gold_data_for_use.csv' (relative path); to_csv includes the
# index by default.
data_for_use.to_csv('gold_data_for_use.csv')