In [1]:
# downloading the data
# !wget -P ./data https://pythonprogramming.net/static/downloads/machine-learning-data/crypto_data.zip

In [19]:
import os
import random
from collections import deque

import numpy as np
import pandas as pd


from sklearn.preprocessing import scale

In [3]:
# CONFIGURATION

# modelling parameters
RATIO_TO_PREDICT = 'LTC-USD'  # ticker whose close price is used to build 'future' and 'target'
FUTURE_PERIOD_PREDICT = 3  # number of rows the close price is shifted by to obtain 'future'
SEQ_LEN = 60  # number of consecutive rows that make up one input sequence

# location of the CSV files, resolved relative to the current working directory
CWD = os.getcwd()
DATADIR = os.path.join(CWD, 'data', 'crypto_data')

In [4]:
def classify(current, future):
    """Label a price move: 1 ("buy") if the future price is strictly higher, else 0.

    Both arguments are converted with ``float`` before comparing, so numeric
    strings are accepted. A tie, a lower future price, or a NaN on either side
    yields 0 because the ``>`` comparison is False in all of those cases.
    """
    price_now = float(current)
    price_later = float(future)
    return int(price_later > price_now)

In [21]:
def preprocess_df(df):
    """Turn a frame of prices/volumes into shuffled (sequence, target) arrays.

    Every column except 'target' is replaced by its percent change and then
    standardised with sklearn's ``scale``. The rows are then walked in order
    and each window of SEQ_LEN consecutive rows is paired with the 'target'
    value of the window's last row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'future' column (dropped here) and a 'target' column.
        The caller's frame is not modified; ``drop`` returns a new frame.

    Returns
    -------
    (X, y) : tuple of numpy.ndarray
        X stacks the windows, giving shape (n_samples, SEQ_LEN, n_features);
        y holds the matching targets. Samples are shuffled, with X and y kept
        in step. Both arrays are empty when fewer than SEQ_LEN rows survive
        the NaN filtering.
    """
    # Keyword form: the positional axis argument (`drop('future', 1)`) is
    # rejected by pandas 2.x.
    df = df.drop(columns='future')

    feature_cols = [col for col in df.columns if col != 'target']
    for col in feature_cols:
        df[col] = df[col].pct_change()
        # pct_change leaves a NaN in the first row; drop it before scaling
        df = df.dropna()
        df[col] = scale(df[col].values)
    df = df.dropna()

    # Select by name so the result does not depend on 'target' being the
    # last column.
    features = df[feature_cols].values
    targets = df['target'].values

    sequential_data = []
    prev_days = deque(maxlen=SEQ_LEN)  # rolling window; oldest row falls off
    for row, target in zip(features, targets):
        prev_days.append(row)
        if len(prev_days) == SEQ_LEN:
            sequential_data.append((np.array(prev_days), target))

    # random.shuffle works in place and returns None, so its result must not
    # be returned directly.
    random.shuffle(sequential_data)

    X = np.array([window for window, _ in sequential_data])
    y = np.array([target for _, target in sequential_data])
    return X, y
    
# NOTE: main_df is assembled in the In [6] cell, which appears further down in
# this file; that cell must already have been run for this call to work.
new_df = preprocess_df(main_df)

In [23]:
# display whatever preprocess_df(main_df) returned (assigned to new_df in the previous cell)
new_df

In [5]:
# column labels handed to read_csv through `names=`
names = ['time', 'low', 'high', 'open', 'close', 'volume']

# load the LTC-USD file from DATADIR and preview its first rows
df = pd.read_csv(
    os.path.join(DATADIR, 'LTC-USD.csv'),
    names=names,
)
df.head()

Unnamed: 0,time,low,high,open,close,volume
0,1528968660,96.580002,96.589996,96.589996,96.580002,9.6472
1,1528968720,96.449997,96.669998,96.589996,96.660004,314.387024
2,1528968780,96.470001,96.57,96.57,96.57,77.129799
3,1528968840,96.449997,96.57,96.57,96.5,7.216067
4,1528968900,96.279999,96.540001,96.5,96.389999,524.539978


In [6]:
# start empty; the first file that is read becomes the base frame of the join
main_df = pd.DataFrame()

# tickers whose CSV files are combined
ratios = ['BCH-USD', 'BTC-USD', 'ETH-USD', 'LTC-USD']

# add each ticker's close/volume columns to main_df, aligned on the 'time' index
for ratio in ratios:
    close_col = '{}_close'.format(ratio)
    volume_col = '{}_volume'.format(ratio)

    dataset = os.path.join(DATADIR, '{}.csv'.format(ratio))
    df = (
        pd.read_csv(dataset, names=names)
        .rename(columns={'close': close_col, 'volume': volume_col})
        .set_index('time')
        .loc[:, [close_col, volume_col]]
    )

    main_df = df if len(main_df) == 0 else main_df.join(df)

# 'future' is the predicted ticker's close shifted up by FUTURE_PERIOD_PREDICT rows,
# so the last FUTURE_PERIOD_PREDICT rows have no future value (NaN)
target_close = main_df.loc[:, '{}_close'.format(RATIO_TO_PREDICT)]
main_df.loc[:, 'future'] = target_close.shift(-FUTURE_PERIOD_PREDICT)

# 'target' labels each row with classify(current close, future close)
main_df.loc[:, 'target'] = [
    classify(current, future)
    for current, future in zip(target_close, main_df.loc[:, 'future'])
]

In [7]:
# sanity check: current close, shifted future close and resulting label, first 10 rows
main_df[['{}_close'.format(RATIO_TO_PREDICT), 'future', 'target']].head(10)

Unnamed: 0_level_0,LTC-USD_close,future,target
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1528968660,96.580002,96.5,0
1528968720,96.660004,96.389999,0
1528968780,96.57,96.519997,0
1528968840,96.5,96.440002,0
1528968900,96.389999,96.470001,1
1528968960,96.519997,96.400002,0
1528969020,96.440002,96.400002,0
1528969080,96.470001,96.400002,0
1528969140,96.400002,96.400002,0
1528969200,96.400002,96.400002,0


In [10]:
# find the timestamp at which the final 5% of the sorted index begins
times = sorted(main_df.index.values)
holdout_size = int(0.05 * len(times))
last5pct = times[-holdout_size]

# rows at or after that timestamp are held out for validation; the rest stay
# in main_df, which is overwritten here (re-running this cell shrinks it again)
is_validation = main_df.index >= last5pct
is_training = main_df.index < last5pct
validation_main_df = main_df.loc[is_validation, :]
main_df = main_df.loc[is_training, :]

# build the sequence/target pairs for each split
X_train, y_train = preprocess_df(main_df)
X_val, y_val = preprocess_df(validation_main_df)

1534879920