## Dataset Preprocessing

We will use the BTC-USD, LTC-USD, ETH-USD and BCH-USD datasets to train our model; however, the data must be
preprocessed before it can actually be fed to the model.

In [2]:
import os

import pandas as pd

# Smoke test: confirm that the BTC-USD CSV in the dataset folder can be loaded.
# The column names are passed explicitly through `names`.
dataset = pd.read_csv(
    "Datasets/BTC-USD.csv",
    names=['time', 'low', 'high', 'open', 'close', 'volume'],
)
print(dataset.head())

         time          low         high         open        close    volume
0  1528968660  6489.549805  6489.560059  6489.560059  6489.549805  0.587100
1  1528968720  6487.370117  6489.560059  6489.549805  6487.379883  7.706374
2  1528968780  6479.410156  6487.370117  6487.370117  6479.410156  3.088252
3  1528968840  6479.410156  6479.419922  6479.419922  6479.410156  1.404100
4  1528968900  6475.930176  6479.979980  6479.410156  6479.979980  0.753000


In [7]:
# Inputs for merge_datasets, which keeps only the "close" and "volume" columns
# of each pair and merges them into a single dataframe.
ratios = ["BTC-USD", "BCH-USD", "ETH-USD", "LTC-USD"]  # pairs to load, one "<ratio>.csv" file each
path = "Datasets/"  # folder that holds the per-pair CSV files


def merge_datasets(ratios, path):
    """Merge the "close" and "volume" columns of several pair CSVs into one frame.

    For every entry in ``ratios`` the file ``<ratio>.csv`` inside ``path`` is
    read with the column names time, low, high, open, close and volume (the
    last five prefixed with ``<ratio>_``). Only ``<ratio>_close`` and
    ``<ratio>_volume`` are kept, indexed by ``time``.

    Parameters
    ----------
    ratios : iterable of str
        Pair names, e.g. ``"BTC-USD"``; each one maps to ``<ratio>.csv``.
    path : str or os.PathLike
        Folder containing the CSV files. A trailing separator is optional.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``time``. The first pair defines the time index and
        every following pair is left-joined onto it. Gaps are forward-filled
        with the previous known value and rows that still contain NaN (no
        earlier value to propagate) are dropped. An empty frame is returned
        when ``ratios`` is empty.
    """
    merged = None  # explicit "nothing merged yet" marker instead of a length check

    for ratio in ratios:
        # os.path.join inserts the separator only when it is missing, so both
        # "Datasets" and "Datasets/" resolve to the same file.
        dataset_path = os.path.join(path, f'{ratio}.csv')
        raw = pd.read_csv(
            dataset_path,
            names=['time', f'{ratio}_low', f'{ratio}_high', f'{ratio}_open', f'{ratio}_close', f'{ratio}_volume'],
        )

        # time is shared by every dataset, so it becomes the join key (index);
        # only the close and volume columns are carried forward.
        close_vol = raw.set_index("time")[[f'{ratio}_close', f'{ratio}_volume']]

        merged = close_vol if merged is None else merged.join(close_vol)

    if merged is None:
        return pd.DataFrame()

    # Non-mutating ffill/dropna: avoids the deprecated fillna(method=...) call
    # and never modifies a slice of the raw frame in place.
    return merged.ffill().dropna()
    
        

In [8]:
# Build the merged close/volume frame and preview its first rows as a sanity check:
main_dataset = merge_datasets(ratios=ratios, path=path)
print(main_dataset.head(n=5))


            BTC-USD_close  BTC-USD_volume  BCH-USD_close  BCH-USD_volume  \
time                                                                       
1528968720    6487.379883        7.706374     870.859985       26.856577   
1528968780    6479.410156        3.088252     870.099976        1.124300   
1528968840    6479.410156        1.404100     870.789978        1.749862   
1528968900    6479.979980        0.753000     870.000000        1.680500   
1528968960    6480.000000        1.490900     869.989990        1.669014   

            ETH-USD_close  ETH-USD_volume  LTC-USD_close  LTC-USD_volume  
time                                                                      
1528968720      486.01001       26.019083      96.660004      314.387024  
1528968780      486.00000        8.449400      96.570000       77.129799  
1528968840      485.75000       26.994646      96.500000        7.216067  
1528968900      486.00000       77.355759      96.389999      524.539978  
1528968960      4

As we can see, the data from the different cryptocurrency datasets has been merged successfully. Now we need to
define our inputs and targets, and then apply some normalization to the input data.

## Target Definition

Since an RNN works on sequential data, we need to prepare our target vector considering the length of the sequence that we will use
as input, the cryptocurrency that we want to predict, and finally the price behaviour (up/down) over the time window that we want to predict.

In [9]:
# Settings used to create the target vector
SEQUENCE_LENGTH = 60 # number of previous instances (rows) used to make each prediction
FUTURE_PERIOD_PREDICTION = 3 # how many rows ahead to look for the future price (a period of 3 min)
RATIO_TO_PREDICT = "BTC-USD" # pair whose close price is used to build the future/target columns

Now we need to define a function that takes the current value and the future value of the crypto and
returns a binary label: 1 when we should buy and 0 when we should sell our asset.

In [10]:
def buy_sell_classification(current_value, future_value):
    """Return 1 (buy) when the future value is strictly higher than the current one, else 0 (sell).

    Both arguments are converted with ``float`` before being compared, so
    numeric strings are accepted. Equal values, and any comparison involving
    NaN, yield 0.
    """
    price_later = float(future_value)
    price_now = float(current_value)
    return 1 if price_later > price_now else 0

In [11]:
def build_target_vector(dataset, ratio_to_predict, future_period_prediction, drop_unlabeled=False):
    """Add the future price and the buy/sell target for one pair to ``dataset``.

    Two columns are written:

    * ``future_<ratio>``: the ``<ratio>_close`` column shifted
      ``future_period_prediction`` rows into the future.
    * ``target_<ratio>``: 1 when the future price is strictly higher than the
      current close, otherwise 0 (the same rule as ``buy_sell_classification``,
      applied to the whole column at once).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing a ``<ratio_to_predict>_close`` column. It is modified
        in place: the two columns are added to the object passed in.
    ratio_to_predict : str
        Pair name used to build the column names.
    future_period_prediction : int
        Number of rows to look ahead; must be at least 1.
    drop_unlabeled : bool, default False
        The last ``future_period_prediction`` rows have no future price, so
        their comparison is against NaN and they receive a target of 0 without
        being real "sell" signals. Pass ``True`` to remove every row whose
        future price is missing (also in place). The default keeps all rows.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object, with the new columns.

    Raises
    ------
    ValueError
        If ``future_period_prediction`` is smaller than 1. A zero or negative
        shift would compare each price with itself or with a past price.
    """
    if future_period_prediction < 1:
        raise ValueError("future_period_prediction must be a positive number of rows")

    close_column = f'{ratio_to_predict}_close'
    future_column = f'future_{ratio_to_predict}'
    target_column = f'target_{ratio_to_predict}'

    # create a new column with future values:
    dataset[future_column] = dataset[close_column].shift(-future_period_prediction)

    # create a target column; the float casts mirror the float() conversion of
    # the per-row rule, and comparisons against NaN evaluate to False -> 0.
    current_prices = dataset[close_column].astype(float)
    future_prices = dataset[future_column].astype(float)
    dataset[target_column] = (future_prices > current_prices).astype("int64")

    if drop_unlabeled:
        # rows without a future price cannot carry a meaningful label
        dataset.dropna(subset=[future_column], inplace=True)

    return dataset

In [14]:
# Attach the future-price and target columns for the selected pair:
main_dataset_with_target = build_target_vector(
    dataset=main_dataset,
    ratio_to_predict=RATIO_TO_PREDICT,
    future_period_prediction=FUTURE_PERIOD_PREDICTION,
)

# Show the current close, the future close and the resulting label side by side:
print(
    main_dataset_with_target[
        [
            f'{RATIO_TO_PREDICT}_close',
            f'future_{RATIO_TO_PREDICT}',
            f'target_{RATIO_TO_PREDICT}',
        ]
    ].head(10)
)

            BTC-USD_close  future_BTC-USD  target_BTC-USD
time                                                     
1528968720    6487.379883     6479.979980               0
1528968780    6479.410156     6480.000000               1
1528968840    6479.410156     6477.220215               0
1528968900    6479.979980     6480.000000               1
1528968960    6480.000000     6479.990234               0
1528969020    6477.220215     6478.660156               1
1528969080    6480.000000     6478.660156               0
1528969140    6479.990234     6479.339844               0
1528969200    6478.660156     6479.350098               1
1528969260    6478.660156     6479.990234               1
