In [33]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn import preprocessing
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM


In [23]:
SEQ_LEN = 60
FUTURE_PERIOD_PREDICT=3
RATIO_TO_PREDICT="LTC-USD"


In [21]:

df = pd.read_csv("BTC-USD.csv", names=["time","low","high","open","close","volume"])
df.head()
main_df = pd.DataFrame()

ratios = ["BTC-USD", "LTC-USD", "BCH-USD", "ETH-USD"]  # the 4 ratios we want to consider
for ratio in ratios:  # begin iteration
    print(ratio)
    dataset = f'{ratio}.csv'  # get the full path to the file.
    df = pd.read_csv(dataset, names=['time', 'low', 'high', 'open', 'close', 'volume'])  # read in specific file

    # rename volume and close to include the ticker so we can still which close/volume is which:
    df.rename(columns={"close": f"{ratio}_close", "volume": f"{ratio}_volume"}, inplace=True)

    df.set_index("time", inplace=True)  # set time as index so we can join them on this shared time
    df = df[[f"{ratio}_close", f"{ratio}_volume"]]  # ignore the other columns besides price and volume

    if len(main_df)==0:  # if the dataframe is empty
        main_df = df  # then it's just the current df
    else:  # otherwise, join this data to the main one
        main_df = main_df.join(df)

main_df.fillna(method="ffill", inplace=True)  # if there are gaps in data, use previously known values
main_df.dropna(inplace=True)
main_df.head(10)

BTC-USD
LTC-USD
BCH-USD
ETH-USD


Unnamed: 0_level_0,BTC-USD_close,BTC-USD_volume,LTC-USD_close,LTC-USD_volume,BCH-USD_close,BCH-USD_volume,ETH-USD_close,ETH-USD_volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1528968720,6487.379883,7.706374,96.660004,314.387024,870.859985,26.856577,486.01001,26.019083
1528968780,6479.410156,3.088252,96.57,77.129799,870.099976,1.1243,486.0,8.4494
1528968840,6479.410156,1.4041,96.5,7.216067,870.789978,1.749862,485.75,26.994646
1528968900,6479.97998,0.753,96.389999,524.539978,870.0,1.6805,486.0,77.355759
1528968960,6480.0,1.4909,96.519997,16.991997,869.98999,1.669014,486.0,7.5033
1528969020,6477.220215,2.73195,96.440002,95.524078,869.450012,0.8652,485.98999,85.877251
1528969080,6480.0,2.17424,96.470001,175.205307,869.98999,23.534929,485.98999,160.915192
1528969140,6479.990234,0.9031,96.400002,43.652802,870.0,2.3,485.98999,61.371887
1528969200,6478.660156,3.258786,96.400002,8.16,870.320007,9.255514,485.98999,42.687656
1528969260,6478.660156,1.970352,96.400002,20.4259,870.650024,2.7956,486.0,97.693878


In [25]:
def classify(current, future):
    if float(future) > float(current):
        return 1
    else:
        return 0
main_df['future'] = main_df[f'{RATIO_TO_PREDICT}_close'].shift(-FUTURE_PERIOD_PREDICT)
main_df.head(10)

Unnamed: 0_level_0,BTC-USD_close,BTC-USD_volume,LTC-USD_close,LTC-USD_volume,BCH-USD_close,BCH-USD_volume,ETH-USD_close,ETH-USD_volume,future
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1528968720,6487.379883,7.706374,96.660004,314.387024,870.859985,26.856577,486.01001,26.019083,96.389999
1528968780,6479.410156,3.088252,96.57,77.129799,870.099976,1.1243,486.0,8.4494,96.519997
1528968840,6479.410156,1.4041,96.5,7.216067,870.789978,1.749862,485.75,26.994646,96.440002
1528968900,6479.97998,0.753,96.389999,524.539978,870.0,1.6805,486.0,77.355759,96.470001
1528968960,6480.0,1.4909,96.519997,16.991997,869.98999,1.669014,486.0,7.5033,96.400002
1528969020,6477.220215,2.73195,96.440002,95.524078,869.450012,0.8652,485.98999,85.877251,96.400002
1528969080,6480.0,2.17424,96.470001,175.205307,869.98999,23.534929,485.98999,160.915192,96.400002
1528969140,6479.990234,0.9031,96.400002,43.652802,870.0,2.3,485.98999,61.371887,96.400002
1528969200,6478.660156,3.258786,96.400002,8.16,870.320007,9.255514,485.98999,42.687656,96.400002
1528969260,6478.660156,1.970352,96.400002,20.4259,870.650024,2.7956,486.0,97.693878,96.449997


In [27]:
main_df['target'] = list(map(classify, main_df[f'{RATIO_TO_PREDICT}_close'], main_df['future']))
main_df.head(10)

Unnamed: 0_level_0,BTC-USD_close,BTC-USD_volume,LTC-USD_close,LTC-USD_volume,BCH-USD_close,BCH-USD_volume,ETH-USD_close,ETH-USD_volume,future,target
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1528968720,6487.379883,7.706374,96.660004,314.387024,870.859985,26.856577,486.01001,26.019083,96.389999,0
1528968780,6479.410156,3.088252,96.57,77.129799,870.099976,1.1243,486.0,8.4494,96.519997,0
1528968840,6479.410156,1.4041,96.5,7.216067,870.789978,1.749862,485.75,26.994646,96.440002,0
1528968900,6479.97998,0.753,96.389999,524.539978,870.0,1.6805,486.0,77.355759,96.470001,1
1528968960,6480.0,1.4909,96.519997,16.991997,869.98999,1.669014,486.0,7.5033,96.400002,0
1528969020,6477.220215,2.73195,96.440002,95.524078,869.450012,0.8652,485.98999,85.877251,96.400002,0
1528969080,6480.0,2.17424,96.470001,175.205307,869.98999,23.534929,485.98999,160.915192,96.400002,0
1528969140,6479.990234,0.9031,96.400002,43.652802,870.0,2.3,485.98999,61.371887,96.400002,0
1528969200,6478.660156,3.258786,96.400002,8.16,870.320007,9.255514,485.98999,42.687656,96.400002,0
1528969260,6478.660156,1.970352,96.400002,20.4259,870.650024,2.7956,486.0,97.693878,96.449997,1


In [31]:
times = sorted(main_df.index.values)
last_5pct = sorted(main_df.index.values)[-int(0.05*len(times))]  # get the last 5% of the times

validation_main_df = main_df[(main_df.index >= last_5pct)]  # make the validation data where the index is in the last 5%
main_df = main_df[(main_df.index < last_5pct)]  # now the main_df is all the data up to the last 5%
main_df.head()


Unnamed: 0_level_0,BTC-USD_close,BTC-USD_volume,LTC-USD_close,LTC-USD_volume,BCH-USD_close,BCH-USD_volume,ETH-USD_close,ETH-USD_volume,future,target
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1528968720,6487.379883,7.706374,96.660004,314.387024,870.859985,26.856577,486.01001,26.019083,96.389999,0
1528968780,6479.410156,3.088252,96.57,77.129799,870.099976,1.1243,486.0,8.4494,96.519997,0
1528968840,6479.410156,1.4041,96.5,7.216067,870.789978,1.749862,485.75,26.994646,96.440002,0
1528968900,6479.97998,0.753,96.389999,524.539978,870.0,1.6805,486.0,77.355759,96.470001,1
1528968960,6480.0,1.4909,96.519997,16.991997,869.98999,1.669014,486.0,7.5033,96.400002,0


In [32]:
validation_main_df.head()

Unnamed: 0_level_0,BTC-USD_close,BTC-USD_volume,LTC-USD_close,LTC-USD_volume,BCH-USD_close,BCH-USD_volume,ETH-USD_close,ETH-USD_volume,future,target
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1534379100,6261.0,5.914241,54.990002,146.135941,510.5,0.709851,280.269989,38.514919,54.98,0
1534379160,6259.930176,0.866465,54.990002,23.640104,509.649994,13.096176,280.730011,385.689606,55.080002,1
1534379220,6253.02002,2.913356,54.98,83.625984,509.850006,5.056426,279.730011,70.71376,54.91,0
1534379280,6253.02002,5.429587,54.98,32.583164,509.850006,0.388667,279.670013,29.253361,54.900002,0
1534379340,6259.029785,2.278883,55.080002,501.737061,510.309998,2.479431,279.899994,35.823578,54.950001,0


In [41]:
from sklearn import preprocessing

def preprocess_df(df):
    for col in df.columns:
        if col != "target":
            df[col] = df[col].pct_change()
            df.dropna(inplace=True)
            df[col] = preprocessing.scale(df[col].values)
    df.dropna(inplace=True)
    return df.drop("target", axis=1), df["target"]  # Split features and target

train_x, train_y = preprocess_df(main_df)
validation_x, validation_y = preprocess_df(validation_main_df)
print(train_x.head())

            BTC-USD_close  BTC-USD_volume  LTC-USD_close  LTC-USD_volume  \
time                                                                       
1528970880       0.001289        0.002191       0.001072        0.003467   
1528970940       0.001185        0.005039       0.000896        0.004873   
1528971000      -0.455177        0.001166       0.001063        0.004700   
1528971060       0.000811        0.014463       0.001078        0.004658   
1528971120      -0.001178       -0.001337       0.001042        0.004548   

            BCH-USD_close  BCH-USD_volume  ETH-USD_close  ETH-USD_volume  \
time                                                                       
1528970880       0.004260        0.004637      -0.005072        0.001339   
1528970940       0.021512        0.004907      -0.005844        0.001459   
1528971000       0.002695        0.004818       0.130324        0.001316   
1528971060      -0.004605        0.004845      -0.005023        0.001478   
1528971120 

In [46]:

from collections import deque
import random

# Assume SEQ_LEN and df are defined earlier (e.g., SEQ_LEN = 60, df from preprocess_df)
def create_sequences(df, SEQ_LEN):
    sequential_data = []  # List to contain the sequences
    prev_days = deque(maxlen=SEQ_LEN)  # Sliding window of SEQ_LEN steps
    
    for i in df.values:  # Iterate over DataFrame rows as arrays
        prev_days.append([n for n in i[:-1]])  # Add features (all but target)
        if len(prev_days) == SEQ_LEN:  # When we have SEQ_LEN steps
            sequential_data.append([np.array(prev_days), i[-1]])  # Pair sequence with target
    
    random.shuffle(sequential_data)  # Shuffle sequences
    return sequential_data  # Return the list for further use

# Example usage (assuming df exists)
# SEQ_LEN = 60  # Define this if not already set
# sequential_data = create_sequences(df, SEQ_LEN)
# print(f"Number of sequences: {len(sequential_data)}")
# print(f"First sequence shape: {sequential_data[0][0].shape}, Target: {sequential_data[0][1]}")

In [53]:
import random

def balance_sequences(sequential_data):
    buys = []  # List for buy sequences and targets (target = 1)
    sells = []  # List for sell sequences and targets (target = 0)
    
    for seq, target in sequential_data:  # Iterate over sequential data
        if target == 0:  # If it's a "not buy"
            sells.append([seq, target])  # Append to sells list
        elif target == 1:  # If it's a "buy"
            buys.append([seq, target])  # Append to buys list
    
    random.shuffle(buys)  # Shuffle the buys
    random.shuffle(sells)  # Shuffle the sells
    
    lower = min(len(buys), len(sells))  # Find the shorter length
    
    buys = buys[:lower]  # Trim buys to match shorter length
    sells = sells[:lower]  # Trim sells to match shorter length
    
    balanced_data = buys + sells  # Combine them
    random.shuffle(balanced_data)  # Shuffle the combined list
    
    return balanced_data  # Return the balanced sequences

# Example usage (assuming sequential_data exists)
# sequential_data = balance_sequences(sequential_data)
# print(f"Total sequences: {len(sequential_data)}")
# print(f"First sequence shape: {sequential_data[0][0].shape}, Target: {sequential_data[0][1]}")

In [55]:
train_x, train_y = preprocess_df(main_df)
validation_x, validation_y = preprocess_df(validation_main_df)

train_x shape: (92825, 9)
train_y shape: (92825,)
validation_x shape: (4877, 9)
validation_y shape: (4877,)


In [56]:
print(f"train data: {len(train_x)} validation: {len(validation_x)}")
print(f"Dont buys: {train_y.count(0)}, buys: {train_y.count(1)}")
print(f"VALIDATION Dont buys: {validation_y.count(0)}, buys: {validation_y.count(1)}")

train data: 92825 validation: 4877


TypeError: count() takes 1 positional argument but 2 were given