## sentdex - Deep Learning Pt.10

<h3> Goal : Balancing RNN sequence data</h3>
<hr>

In [4]:
# With 60 minutes, predict future 3 minutes

SEQ_LEN = 60
FUTURE_PERIOD_PREDICT = 3
RATIO_TO_PREDICT = "LTC-USD"

def classify(current, future):
    if float(future) > float(current):
        return 1
    else:
        return 0
    
import pandas as pd
import os

main_df = pd.DataFrame()

ratios = ["BTC-USD", "LTC-USD", "ETH-USD", "BCH-USD"]
for ratio in ratios:
    dataset = f"crypto_data/{ratio}.csv"
    
    df = pd.read_csv(dataset,
                     names = ["time", "low", "high", "open", "close", "volume"])
    df.rename(columns = {"close" : f"{ratio}_close", "volume" : f"{ratio}_volume"}, inplace = True)
    
    df.set_index("time", inplace = True)
    df = df[[f"{ratio}_close", f"{ratio}_volume"]]
    
    if len(main_df) == 0:
        main_df = df
    else:
        main_df = main_df.join(df)
        
# "future" value gonna be 3-days-shifted value of "close" coluomn
main_df["future"] = main_df[ f"{RATIO_TO_PREDICT}_close" ].shift(-FUTURE_PERIOD_PREDICT)        

# Classify! (Not ML tho)
main_df['target'] = list(map(classify, main_df[ f"{RATIO_TO_PREDICT}_close"], main_df["future"]))

print(main_df[ [ f"{RATIO_TO_PREDICT}_close", "future", "target"] ].head(10))

            LTC-USD_close     future  target
time                                        
1528968660      96.580002  96.500000       0
1528968720      96.660004  96.389999       0
1528968780      96.570000  96.519997       0
1528968840      96.500000  96.440002       0
1528968900      96.389999  96.470001       1
1528968960      96.519997  96.400002       0
1528969020      96.440002  96.400002       0
1528969080      96.470001  96.400002       0
1528969140      96.400002  96.400002       0
1528969200      96.400002  96.400002       0


In [5]:
from sklearn import preprocessing
from collections import deque
import random
import numpy as np

def preprocess_df(df):
    
    # "future" column is not needed
    df = df.drop("future", axis = 1)
    
    # scaling
    for col in df.columns:
        if col != "target":
            
            df[col] = df[col].pct_change()
            df.dropna(inplace = True)
            df[col] = preprocessing.scale(df[col].values)
    
    # just in cases
    df.dropna(inplace = True)
    
    
    sequential_data = []
    prev_mins = deque(maxlen = SEQ_LEN)
    
    for i in df.values:  # df.values: each row of df
        prev_mins.append([n for n in i[:-1]])  # why i[:-1]? : Not to include target class
        
        if len(prev_mins) == SEQ_LEN:
            sequential_data.append([np.array(prev_mins), i[-1]])  # past 60 days (including _day (i) ) + target value of _day(i)
            
    random.shuffle(sequential_data)
    
    ########################################################################
    ########################################################################
    
    # For balancing!
    # Separate buys / dont buys
    buys, sells = [], []
    
    for seq, target in sequential_data:
        if target == 0:
            sells.append([seq, target])
        elif target == 1:
            buys.append([seq, target])
            
    # Shuffle
    random.shuffle(buys)
    random.shuffle(sells)
    
    lower = min(len(buys), len(sells))
    
    # Resize to the lower size
    buys = buys[:lower]
    sells = sells[:lower]
    
    # Concatenate
    sequential_data = buys + sells
    random.shuffle(sequential_data)
    
    # Split data into X, y
    X = []
    y = []
    
    for seq, target in sequential_data:
        X.append(seq)
        y.append(target)
    
    return np.array(X), y

In [6]:
times = sorted(main_df.index.values)
last_5pct = times[-int(0.05 * len(times))] # threshold

validation_main_df = main_df[ (main_df.index >= last_5pct)] # validation (5%)
main_df = main_df[ (main_df.index < last_5pct)] # the rest

train_x, train_y = preprocess_df(main_df)
validation_x, validation_y = preprocess_df(validation_main_df)

print(f"train data: {len(train_x)} validation: {len(validation_x)}")
print(f"Dont buys: {train_y.count(0)}, buys: {train_y.count(1)}")
print(f"VALIDATION Dont buys: {validation_y.count(0)}, buys: {validation_y.count(1)}")

train data: 69188 validation: 3062
Dont buys: 34594, buys: 34594
VALIDATION Dont buys: 1531, buys: 1531


Now, same data size for each target (buy / dont buy)!