In [1]:
import numpy as np
import pandas as pd

In [2]:
# Location of the raw price CSV and the feature column(s) kept for modelling.
dfp = 'data/bitcoin2015to2017.csv'
columns = ['Close']

# Read the file once; every derived object below comes from this raw frame.
raw_df = pd.read_csv(dfp)
original_df = raw_df.copy()          # untouched copy, used later for the month split
time_stamps = raw_df['Timestamp']    # timestamp column, kept alongside the prices
df = raw_df.loc[:, columns]          # modelling frame restricted to `columns`


In [3]:
from pandas import DataFrame
from pandas import concat

def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """
    Frame a time series as a supervised learning dataset.

    Arguments:
        data: Observations as a list, a 1-D or 2-D NumPy array, a pandas
            Series or a DataFrame. Rows are time steps, columns are variables.
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y).
        dropnan: Boolean whether or not to drop rows with NaN values.
    Returns:
        Pandas DataFrame of series framed for supervised learning. Columns are
        ordered oldest lag first and named 'var<j>(t-<i>)', 'var<j>(t)' and
        'var<j>(t+<i>)'.
    """
    df = DataFrame(data)
    # Count the variables on the constructed frame rather than on `data`:
    # 1-D arrays and Series have no shape[1], and a list of rows has more
    # than one variable.
    n_vars = df.shape[1]
    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += ['var%d(t-%d)' % (j + 1, i) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += ['var%d(t)' % (j + 1) for j in range(n_vars)]
        else:
            names += ['var%d(t+%d)' % (j + 1, i) for j in range(n_vars)]
    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names
    # shifting leaves NaN at the start (lags) and end (forecasts) of the frame
    if dropnan:
        agg = agg.dropna()
    return agg

In [4]:
# Each row is one sliding window: 256 lagged values followed by 16 forecast steps.
supervised = np.array(series_to_supervised(df, 256, 16))
original_supervised = supervised.copy()

# Build the matching timestamp windows. The Series is converted to an (n, 1)
# NumPy column first: indexing a Series with [:, None] is no longer supported
# by current pandas releases.
timestamp_column = time_stamps.values[:, None]
supervised_timestamp = np.array(series_to_supervised(timestamp_column, 256, 16))

In [5]:
# Sanity check: (number of windows, 256 lag columns + 16 forecast columns).
supervised_timestamp.shape

(300357, 272)

In [6]:
from sklearn.preprocessing import StandardScaler
# Standardise every window on its own values: the scaler is re-fitted per row,
# treating the row as a single column of samples. Each transformed row comes
# back as a column vector, so the stacked result has a trailing axis of size 1.
scaler = StandardScaler()
scaled_supervised = np.array([
    scaler.fit_transform(window.reshape(-1, 1)) for window in supervised
])

In [7]:
def splitByMonth(df):
    """
    Locate the first row of every calendar month in a timestamped frame.

    Arguments:
        df: DataFrame with a 'Timestamp' column of Unix epoch seconds.
            A 'month' column (year * 100 + month, e.g. 201503) is added to
            it in place.
    Returns:
        Sorted NumPy integer array with the positional index of the first
        row of each distinct month.
    """
    # Parse the timestamps once and reuse the result for both month and year.
    moments = pd.to_datetime(df.Timestamp, unit='s')
    df['month'] = moments.dt.month + moments.dt.year * 100
    # A row starts a month when its key has not been seen earlier; the
    # positions of those rows are already in ascending order.
    starts_month = ~df['month'].duplicated()
    return np.flatnonzero(starts_month.to_numpy())
# Month boundaries of the untouched copy of the data (also adds its 'month' column).
splitIndexes = splitByMonth(original_df)

In [8]:
# Row positions in original_df at which each distinct month first appears.
splitIndexes

array([     0,   2650,  11578,  20218,  29146,  37786,  46714,  55642,
        64282,  73210,  81850,  90778,  99706, 108058, 116986, 125626,
       134554, 143194, 152122, 161050, 169690, 178618, 187258, 196186,
       205114, 213178, 222106, 230746, 239674, 248314, 257242, 266170,
       274810, 283738, 292378])

In [9]:
# Columns 0-255 hold the lagged inputs; the remaining columns hold the
# forecast targets. The same column cut is applied to the raw windows,
# the scaled windows and the timestamp windows.
X, y = supervised[:, :256], supervised[:, 256:]
scaled_X, scaled_Y = scaled_supervised[:, :256], scaled_supervised[:, 256:]
supervised_timestamp_X, supervised_timestamp_y = (
    supervised_timestamp[:, :256],
    supervised_timestamp[:, 256:],
)

In [10]:
# Walk-forward folds: fold k takes its inputs from the rows spanning four
# consecutive month boundaries and its outputs from the rows of the month
# that follows.
# NOTE(review): splitIndexes are row positions in original_df, while the
# arrays sliced here come from `supervised`, whose leading rows were dropped
# as NaN. Confirm the boundaries line up as intended.
X_split, y_split = [], []
scaled_X_split, scaled_y_split = [], []
supervised_timestamp_X_split, supervised_timestamp_y_split = [], []

for fold in range(len(splitIndexes) - 5):
    input_rows = slice(splitIndexes[fold], splitIndexes[fold + 4])
    output_rows = slice(splitIndexes[fold + 4], splitIndexes[fold + 5])

    X_split.append(X[input_rows, :])
    y_split.append(y[output_rows, :])
    scaled_X_split.append(scaled_X[input_rows, :])
    scaled_y_split.append(scaled_Y[output_rows, :])
    supervised_timestamp_X_split.append(supervised_timestamp_X[input_rows, :])
    supervised_timestamp_y_split.append(supervised_timestamp_y[output_rows, :])

In [35]:
# Destination HDF5 file for the walk-forward datasets.
file_name = 'data/bitcoin2015to2017_wf.h5'

In [36]:
import h5py
# Variable-length float64 element type, used for the per-fold lists
# (the folds are separate arrays and may differ in row count).
dt = h5py.special_dtype(vlen=np.dtype('float64'))

# (dataset name, payload, dtype) in the order they are written.
# A dtype of None lets h5py infer the type from the data.
h5_datasets = [
    ("inputs", scaled_X_split, dt),
    ('outputs', scaled_y_split, dt),
    ("input_times", supervised_timestamp_X_split, dt),
    ('output_times', supervised_timestamp_y_split, dt),
    ("original_datas", supervised, None),
    ('original_inputs', X_split, dt),
    ('original_outputs', y_split, dt),
]

# Mode 'w' creates the file, truncating any existing one.
with h5py.File(file_name, 'w') as f:
    for ds_name, ds_payload, ds_dtype in h5_datasets:
        f.create_dataset(ds_name, data=ds_payload, dtype=ds_dtype)