In [1]:
from urllib.request import urlopen
import numpy as np
import requests
import pandas as pd
import json, time, datetime
import random
import math
import sklearn.preprocessing as prep
from tempfile import TemporaryFile

DATA_MARKET = 'data/poloniex/'
DATA_TWITTER = 'data/twitter/sentiment/'

INPUT_SEQ_LENGTH = 288 # 3 Days - 3*24*60/5
OUTPUT_SEQ_LENGTH = 24 # 2 hours

class PastSampler:

    def __init__(self, N, K, sliding_window = True, step_size=1):
        self.K = K
        self.N = N
        self.sliding_window = sliding_window
        self.step_size = step_size
 
    def transform(self, A):
        M = self.N + self.K     #Number of samples per row (sample + target)
        #indexes
        if self.sliding_window:
            I = np.arange(M) + np.arange(A.shape[0] - M + 1, step=self.step_size).reshape(-1, 1)
        else:
            if A.shape[0]%M == 0:
                I = np.arange(M)+np.arange(0,A.shape[0],M).reshape(-1,1)
                
            else:
                I = np.arange(M)+np.arange(0,A.shape[0] -M,M).reshape(-1,1)    
        #print(I)
        #print(I.shape)
        
        B = A[I].reshape(-1, M * A.shape[1], A.shape[2])
        ci = self.N * A.shape[1]    #Number of features per sample
        #print('ci', ci)
        #print('B shape', B.shape)
        return B[:, :ci], B[:, ci:, 0:1] #Sample matrix, Target matrix


def date_to_timestamp(s):
    return time.mktime(datetime.datetime.strptime(s, "%d/%m/%Y").timetuple())
    
def print_time(unix, msg=''):
    print(msg, time.ctime(int(unix)))

def split_data(data, s='01/03/2018'):
    split_time = date_to_timestamp(s)
    train = data.query('date<=@split_time')
    test = data.query('date>@split_time')
    return train, test

def download_data():
    # connect to poloniex's API
    CURRENCIES = ['USDT_BTC', 'USDT_LTC', 'USDT_ETH', 'USDT_XRP']
    url = 'https://poloniex.com/public?command=returnChartData&currencyPair=$C&start=1356998100&end=9999999999&period=300'
    urls = [url.replace('$C', c) for c in CURRENCIES]

    for i, c in enumerate(CURRENCIES):
        with urlopen(urls[i]) as url:
            r = url.read()
            d = json.loads(r.decode())
            df = pd.DataFrame(d)
            df = df.drop(columns=['high', 'low', 'open', 'weightedAverage'])
            #print(df.columns)
            df.to_pickle(DATA_MARKET + c + '.pkl')
            print('Successfully downloaded', c)
            print_time(min(df['date']), 'MIN:')
            print_time(max(df['date']), 'MAX:')
            
    
    df_btc = pd.read_pickle(DATA_MARKET + 'USDT_BTC.pkl')
    df_ltc = pd.read_pickle(DATA_MARKET + 'USDT_LTC.pkl')
    df_eth = pd.read_pickle(DATA_MARKET + 'USDT_ETH.pkl')
    df_xrp = pd.read_pickle(DATA_MARKET + 'USDT_XRP.pkl')
    
    
    #combine all dataframes into one with size of smallest dataframe - discard every other value
    count = [min(df_btc.count(numeric_only=True)), min(df_ltc.count(numeric_only=True)), min(df_eth.count(numeric_only=True)), min(df_xrp.count(numeric_only=True))]
    count = min(count)
    print_time(df_ltc['date'].iloc[-count], 'min date:')

    df_btc = df_btc.add_prefix('btc_')
    df_eth = df_eth.add_prefix('eth_')
    df_ltc = df_ltc.add_prefix('ltc_')
    df_xrp = df_xrp.add_prefix('xrp_')

    df_all = pd.concat([df_btc.iloc[-count:].reset_index(drop=True), df_eth.iloc[-count:].reset_index(drop=True), df_ltc.iloc[-count:].reset_index(drop=True), df_xrp.iloc[-count:].reset_index(drop=True)], axis=1)
    df_all.count(numeric_only=True)

    #cuz date column is same for every currency, we will discard others
    df_all.head()
    df_all['date'] = df_all['btc_date']
    df_all = df_all.drop(columns=['btc_date', 'ltc_date', 'eth_date', 'xrp_date'])
    df_all.to_pickle(DATA_MARKET + 'combined.pkl')

    
def load_data():
    """
    
    """    
    price_data = pd.read_pickle(DATA_MARKET + 'combined.pkl')
    sentiment_data = pd.read_pickle(DATA_TWITTER + 'btc_expanded.pkl')
    
    min_date = min(sentiment_data['date'])
    max_date = max(sentiment_data['date'])
    
    price_data = price_data.query('@min_date <= date <= @max_date')
    
    return pd.merge(price_data, sentiment_data, how='inner', left_on='date', right_on='date')

def normalize_fit_transform(X, fields=None):
    """
    Normalize data 
    """
    global scaler 
    scaler = prep.MinMaxScaler()
    if fields is not None:
        X = scaler.fit_transform(X[fields])
    else:
        X = scaler.fit_transform(X)
    return X, scaler

def normalize_transform(X):
    if scaler is None:
        print('Scaler doesnt exist, please use normalize_fit_transform function first')
    else:
        X = scaler.transform(X)
        return X
    
def denormalize_1d(data, min_, scale_):
    data -= min_
    data /= scale_
    return data

def denormalize_full(data):
    if scaler is None:
        print('Scaler doesnt exist, please use normalize_fit_transform function first')
    else:
        X = scaler.inverse_transform(data)
        return X

def fetch_batch_size_random(X, Y, batch_size):
    """
    Returns randomly an aligned batch_size of X and Y among all examples.
    The external dimension of X and Y must be the batch size (eg: 1 column = 1 example).
    X and Y can be N-dimensional.
    """
    assert X.shape[0] == Y.shape[0], (X.shape, Y.shape)
    idxes = np.random.randint(X.shape[0], size=batch_size)
    X_out = np.array(X[idxes]).transpose((1, 0, 2))
    Y_out = np.array(Y[idxes]).transpose((1, 0, 2))
    return X_out, Y_out

X_train = []
Y_train = []
X_test = []
Y_test = []

def prepare_data(input_seq_length, output_seq_length, sliding_window=True, step_size=5):
    data = load_data()
    train, test = split_data(data)

    train = train.drop(columns=['date'])
    test = test.drop(columns=['date'])

    train, _ = normalize_fit_transform(train)
    test = normalize_transform(test)

    ps = PastSampler(input_seq_length, output_seq_length, sliding_window=True, step_size=5)

    X_train, Y_train = ps.transform(train[:,None,:])
    X_test, Y_test = ps.transform(test[:,None,:])
    
    return X_train, Y_train, X_test, Y_test

def generate_data_tf(isTrain, batch_size):
    """
    test
    """
    global Y_train
    global X_train
    global X_test
    global Y_test
    
    if len(Y_test) == 0:
        X_train, Y_train, X_test, Y_test = prepare_data(INPUT_SEQ_LENGTH, OUTPUT_SEQ_LENGTH, sliding_window=True, step_size=5)

    if isTrain:
        return fetch_batch_size_random(X_train, Y_train, batch_size)
    else:
        return fetch_batch_size_random(X_test,  Y_test,  batch_size)

def generate_data_keras(input_seq_length, output_seq_length):
    X_train, Y_train, X_test, Y_test = prepare_data(input_seq_length, output_seq_length, sliding_window=True, step_size=5)
    return X_train, Y_train, X_test, Y_test

In [21]:
X_train, Y_train, X_test, Y_test = generate_data_keras(50, 10)

print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(45488, 50, 17) (45488, 10, 1)
(1773, 50, 17) (1773, 10, 1)


In [5]:
np.set_printoptions(suppress=True)

In [10]:
idxes = np.random.randint(500, size=3)
idxes

array([450, 363,  74])

In [10]:
data = load_data()
print_time(min(data['date']), 'MIN')
print_time(max(data['date']), 'MAX')
#print(data.info())
ps = PastSampler(INPUT_SEQ_LENGTH, OUTPUT_SEQ_LENGTH, sliding_window=False, step_size=5)

data_norm, _scaler = normalize_fit_transform(data)
print(data_norm.shape)
x, y = ps.transform(data_norm[:,None,:])


MIN Fri Jan  1 01:55:00 2016
MAX Sun Apr  1 00:40:00 2018
(236422, 18)
ci 864
B shape (266, 888, 18)


In [13]:
y_denorm = _scaler.inverse_transform(y)

ValueError: Found array with dim 3. Estimator expected <= 2.

In [12]:
print(x.shape, y.shape)

(266, 864, 18) (266, 24, 1)


In [11]:
print(_scaler.scale_)
print(_scaler.data_min_)
print(_scaler.data_max_)
print(_scaler.data_range_)

[0.00005108 0.00080358 0.00000011 0.00070345 0.00005184 0.00000038
 0.00272846 0.00003035 0.00000041 0.30533256 0.00000009 0.00000018
 0.00000001 0.54848092 0.00001285 0.20402215 4.7191309  0.5       ]
[ 3.20000000e+02  0.00000000e+00  0.00000000e+00  9.14500000e-01
  0.00000000e+00  0.00000000e+00  1.64298200e+00  0.00000000e+00
  0.00000000e+00  4.65250000e-03  0.00000000e+00  0.00000000e+00
  1.45160970e+09 -8.80753846e-01  0.00000000e+00 -1.43142857e+00
  7.88091068e-01 -1.00000000e+00]
[1.98966873e+04 1.24443795e+03 8.90561191e+06 1.42247000e+03
 1.92911964e+04 2.63597001e+06 3.68150478e+02 3.29514590e+04
 2.44343746e+06 3.27976999e+00 1.09003565e+07 5.60500361e+06
 1.52253600e+09 9.42463636e-01 7.78133333e+04 3.47000000e+00
 9.99994493e-01 1.00000000e+00]
[   19576.6872996      1244.43795046  8905611.9065042      1421.55549992
    19291.19637626  2635970.0134316       366.5074956     32951.45899596
  2443437.4649415         3.27511749 10900356.479903    5605003.6082574
 70926300.

In [12]:
1*24*60/5

288.0

In [13]:
x, y, z, w = generate_data_keras(288, 24)

In [14]:
x.nbytes/1024/1024

1697.26904296875

In [15]:
y.nbytes/1024/1024

8.3199462890625

In [16]:
z.nbytes/1024/1024

64.360107421875

In [17]:
w.nbytes/1024/1024

0.31549072265625