In [None]:
import os

import pandas as pd
import numpy as np


import plotly.express as px

In [None]:
def add_diff_from_var(df, columns, var):
    """Add per-ticker difference columns between a reference column and others.

    For each of the prefixes ``BTC_`` and ``XRP_`` and each name in
    ``columns``, a column ``<prefix><name>_diff`` is written holding
    ``df[<prefix><var>] - df[<prefix><name>]``.

    Args:
        df: frame holding the prefixed columns; it is modified in place.
        columns: un-prefixed names of the columns to subtract.
        var: un-prefixed name of the reference column.

    Returns:
        The same ``df`` object, with the new ``*_diff`` columns appended.
    """
    for prefix in ('BTC_', 'XRP_'):
        for name in columns:
            reference = df[prefix + var]
            compared_name = prefix + name
            # Reference minus compared column: a positive value means the
            # reference is above the compared column.
            df[f"{compared_name}_diff"] = reference - df[compared_name]

    return df


def get_targets(
    df, ticker, look_ahead_length, threshold_0, threshold_1 
):
    """Build look-ahead price-move targets and a 0-5 class label per row.

    For row ``i`` the window is ``prices[i : i + look_ahead_length]`` (it
    starts at the row itself). Two relative moves are measured against the
    price at row ``i``:

    * ``max_...``: (window maximum - price) / price
    * ``min_...``: (price - window minimum) / price

    Labels, checked in this order (first match wins):

    * 5: both moves >= ``threshold_0``
    * 4: upward move >= ``threshold_1``
    * 3: downward move >= ``threshold_1``
    * 2: upward move >= ``threshold_0``
    * 1: downward move >= ``threshold_0``
    * 0: none of the above

    Args:
        df: frame with a ``timestamp`` column and a ``<ticker>_lastPrice`` column.
        ticker: column prefix without the underscore, e.g. ``"XRP"``.
        look_ahead_length: window size in rows; must be at least 1.
        threshold_0: smaller relative-move threshold.
        threshold_1: larger relative-move threshold.

    Returns:
        A frame with index ``0 .. len(df) - 1`` and the columns
        ``min_percentage_change_after_<n>_points``,
        ``max_percentage_change_after_<n>_points``, ``timestamp`` and ``label``.
        The last ``look_ahead_length`` rows (or every row, when ``df`` is not
        longer than the look-ahead) are entirely missing values.

    Raises:
        ValueError: if ``look_ahead_length`` is smaller than 1.
    """
    if look_ahead_length < 1:
        raise ValueError("look_ahead_length must be at least 1")

    price_varname = ticker+'_lastPrice'
    look_ahead_varname = f"percentage_change_after_{look_ahead_length}_points" 
    min_LAVN, max_LAVN = 'min_'+look_ahead_varname, 'max_'+look_ahead_varname

    prices = df[price_varname].values
    timestamps = df['timestamp'].values

    n_rows = len(prices)
    # Only rows followed by at least `look_ahead_length` further rows get a
    # target; clamp at zero so short frames simply produce no labelled rows.
    n_labelled = max(n_rows - look_ahead_length, 0)

    window_max = np.array(
        [prices[i: i+look_ahead_length].max() for i in range(n_labelled)],
        dtype=float,
    )
    window_min = np.array(
        [prices[i: i+look_ahead_length].min() for i in range(n_labelled)],
        dtype=float,
    )
    start_price = np.asarray(prices[:n_labelled], dtype=float)

    max_change = (window_max - start_price) / start_price
    min_change = (start_price - window_min) / start_price

    # np.select takes the first matching condition, like an if/elif chain.
    labels = np.select(
        [
            (max_change >= threshold_0) & (min_change >= threshold_0),
            max_change >= threshold_1,
            min_change >= threshold_1,
            max_change >= threshold_0,
            min_change >= threshold_0,
        ],
        [5, 4, 3, 2, 1],
        default=0,
    )

    # Build the frame in one go instead of appending row by row.
    targets = pd.DataFrame({
        min_LAVN: min_change,
        max_LAVN: max_change,
        'timestamp': timestamps[:n_labelled],
        'label': labels,
    })

    # Pad with missing values so the result has exactly one row per input row.
    return targets.reindex(range(n_rows))
    
    
def plot_data(df, columns, ticker):
    """Show an interactive line chart of several columns of one ticker.

    Args:
        df: frame holding the ``<ticker>_<name>`` columns.
        columns: un-prefixed column names to draw, one line each.
        ticker: column prefix without the underscore, e.g. ``"XRP"``.

    The x axis is the ``timestamp`` column when the frame has one. Frames
    from which ``timestamp`` has already been dropped are plotted against
    their row index instead of raising.
    """
    prefix = ticker + '_'
    y_columns = [prefix + name for name in columns]

    x_axis = 'timestamp' if 'timestamp' in df.columns else df.index

    fig = px.line(
        df,
        x=x_axis,
        y=y_columns
    )
    fig.show()

In [None]:
def group_test_train_mask(group, train_size):
    """Mark the leading ``train_size`` fraction of ``group`` as training rows.

    Args:
        group: frame for one group; rows are taken in their existing order.
        train_size: fraction (0-1) of rows, counted from the top, to mark as
            train. The count is ``int(len(group) * train_size)``.

    Returns:
        A copy of ``group`` whose ``is_test`` column is 0 for the leading train
        rows. Other rows keep their existing ``is_test`` value (1 when the
        column had to be created here). The input frame is left untouched.
    """
    group = group.copy()
    if 'is_test' not in group.columns:
        group['is_test'] = 1

    n_train = int(len(group)*train_size)
    # A single positional .iloc write; chained indexing such as
    # group.iloc[:k]['is_test'] = 0 only modifies a temporary object.
    group.iloc[:n_train, group.columns.get_loc('is_test')] = 0

    return group


def add_test_train_mask(df, train_size):
    """Add an ``is_test`` flag that splits every label class into train/test.

    Within each ``label`` value, the first ``int(n * train_size)`` rows (in the
    frame's current row order, ``n`` being the class size) get ``is_test = 0``
    and the remaining rows get ``is_test = 1``.

    Rows whose ``label`` is missing belong to no class and are left out of the
    result, as a ``groupby`` on ``label`` would do by default.

    Args:
        df: frame with ``label`` and ``timestamp`` columns; it is not modified.
        train_size: fraction (0-1) of each class assigned to the train split.

    Returns:
        A new frame with the labelled rows, the added integer ``is_test``
        column, the original index labels, sorted by ``timestamp``.
    """
    labelled = df[df['label'].notna()].copy()

    per_label = labelled.groupby('label')
    # Position of each row inside its own class, in current row order.
    position_in_class = per_label.cumcount()
    # Labels are non-null here, so 'count' equals the class size.
    class_size = per_label['label'].transform('count')
    # astype(int) truncates like int(), giving int(n * train_size) per class.
    n_train = (class_size * train_size).astype(int)

    labelled['is_test'] = (position_in_class >= n_train).astype(int)

    return labelled.sort_values(by='timestamp')

In [None]:
def add_day_and_hour(df):
    """Parse ``timestamp`` and derive ``hour`` and ``day`` columns from it.

    ``timestamp`` is parsed with the format ``"%m:%d:%Y %H:%M:%S"`` and
    replaced by the parsed datetimes. ``hour`` is the hour of day and ``day``
    is the day of the week (pandas ``dayofweek``, Monday = 0).

    The frame is modified in place and also returned.
    """
    parsed = pd.to_datetime(df['timestamp'], format="%m:%d:%Y %H:%M:%S")

    df['timestamp'] = parsed
    df['hour'] = parsed.dt.hour
    df['day'] = parsed.dt.dayofweek

    return df


def add_moving_avgs(df, movings_avgs, varnames):
    """Add exponentially weighted moving averages for both tickers.

    For every prefix in ``BTC_`` / ``XRP_``, every span in ``movings_avgs`` and
    every name in ``varnames``, a column ``<prefix><span>_ema_<name>`` is
    written holding ``df[<prefix><name>].ewm(span=<span>).mean()``.

    Args:
        df: frame holding the prefixed source columns; modified in place.
        movings_avgs: iterable of EMA spans, in rows.
        varnames: un-prefixed names of the columns to average.

    Returns:
        The same ``df`` object, with the EMA columns appended.
    """
    for prefix in ('BTC_', 'XRP_'):
        for span in movings_avgs:
            for name in varnames:
                smoothed = df[prefix + name].ewm(span=span).mean()
                df[f"{prefix}{str(span)}_ema_{name}"] = smoothed

    return df

# def add_spreads(df, ema_prefix=None):
#     for ticker in ['BTC_', 'XRP_']:
#         df[ticker+'quant_spread'] = df[ticker+'askQty'] - df[ticker+'bidQty'] 
#         df[ticker+'spread'] = df[ticker+'askPrice'] - df[ticker+'bidPrice'] 
        
#         if ema_prefix is not None:
#             df[ticker+'avg_quant_spread'] = df[ticker+ema_prefix+'_askQty'] - df[ticker+ema_prefix+'_bidQty'] 
        
#     return df

  
def add_spreads(df):
    """Add ask-minus-bid spread columns for both tickers.

    For every prefix in ``BTC_`` / ``XRP_`` two columns are written:

    * ``<prefix>quant_spread`` = ``<prefix>askQty`` - ``<prefix>bidQty``
    * ``<prefix>spread`` = ``<prefix>askPrice`` - ``<prefix>bidPrice``

    The frame is modified in place and also returned.
    """
    for prefix in ('BTC_', 'XRP_'):
        quantity_gap = df[prefix + 'askQty'] - df[prefix + 'bidQty']
        df[prefix + 'quant_spread'] = quantity_gap

        price_gap = df[prefix + 'askPrice'] - df[prefix + 'bidPrice']
        df[prefix + 'spread'] = price_gap

    return df

### Featurization Ideas:
* Train a model taking the volumes to try to predict the hour in the day / day of the week.
    * Take the final layer (extracted vector) from this model as extra features to price prediction model
* Train a model that takes the hour of the day as input (maybe one-hot encoded) and tries to predict the volume; use it to obtain a condensed encoding of hour of day / day of week.

### Moving Averages 

* Each row is 12 seconds, so 5 rows = 1 minute
* The number is the EMA span in rows (`ewm(span=...)`)
* a 25 moving avg is a 5 minute moving avg
* a 50 moving avg is a 10 minute moving avg
* a 100 moving avg is a 20 minute moving avg
* a 200 moving avg is a 40 minute moving avg
* a 1000 moving avg is a 200 minute (3.33 hr) moving avg

In [None]:
# --- configuration ---
dir_path = "../data/streams/XRPEUR/numeric/"
data_filename = "12_second_interval_08_05_2021__22_54.csv"

# Derived files go into a sub-folder of the raw-data folder.
preprocessed_dir = os.path.join(dir_path, "preprocessed")

movings_avgs = [50, 200, 1000] #25, 100

# Columns compared against lastPrice to produce the *_diff features.
# 'highPrice', 'lowPrice', 'openPrice', 'prevClosePrice',
add_diff_columns = [
    'askPrice', 'bidPrice', 'weightedAvgPrice'
] + [str(avg)+'_ema_lastPrice' for avg in movings_avgs]

# --- get & preprocess data ---
data = pd.read_csv(
    os.path.join(dir_path, data_filename), index_col=0
).drop(['BTC_closeTime', 'XRP_closeTime'], axis=1)

data = add_day_and_hour(data)
data = add_spreads(data)
data = add_moving_avgs(data, movings_avgs, ['lastPrice'])
data = add_diff_from_var(data, add_diff_columns, 'lastPrice')
# EMAs of the spreads and of the ask/bid gaps need the columns created above.
data = add_moving_avgs(data, [500], ['quant_spread', 'spread', 'askPrice_diff', 'bidPrice_diff'])

# data = add_diff_from_var(data, ['500_ema_quant_spread'], 'quant_spread')
# data = add_diff_from_var(data, ['500_ema_spread'], 'spread')

# --- create targets ---
# Targets are computed from the unsmoothed XRP last price, before 'timestamp'
# is dropped below. 650 rows at 12 seconds per row is 130 minutes.
targets = get_targets(
    data, "XRP", look_ahead_length=650, threshold_0=0.007, threshold_1=0.02
)

targets = add_test_train_mask(targets, train_size=0.8)

# Columns removed from the feature frame before it is saved.
booo_columns = [
    'BTC_lowPrice', 'BTC_highPrice', 'BTC_openPrice', 'BTC_prevClosePrice',
    'XRP_highPrice', 'XRP_lowPrice', 'XRP_openPrice', 'XRP_prevClosePrice',
    'timestamp', 'day'
]
data = data.drop(booo_columns, axis=1)

# --- save data ---
# Create the output folder only once the raw file has loaded, so a wrong
# dir_path fails on read_csv rather than leaving an empty directory tree.
os.makedirs(preprocessed_dir, exist_ok=True)

targets.to_csv(os.path.join(preprocessed_dir, "targets_"+data_filename))
data.to_csv(os.path.join(preprocessed_dir, "data_"+data_filename))


### SMOOTH DATA !
smooth_span = 5 

# Smooth every column that is not already an EMA; 'hour' and the weighted
# average prices are left as they are.
to_smooth_columns = [
    col for col in data.columns[~data.columns.str.contains('ema')] 
    if col not in ['hour', 'BTC_weightedAvgPrice', 'XRP_weightedAvgPrice']
]

# NOTE: this overwrites `data` in place, so later cells see the smoothed values.
for var in to_smooth_columns:
    data[var] = data[var].ewm(span=smooth_span).mean()

data.to_csv(os.path.join(preprocessed_dir, "smooth_data_"+data_filename))

In [None]:
#targets[targets['label'] == 2] 

In [None]:
# Bid and ask quantities for XRP.
columns = ['bidQty', 'askQty']
plot_data(data, columns, 'XRP')

# XRP quantity spread next to its 500-row EMA.
columns = ['500_ema_quant_spread', 'quant_spread']
plot_data(data, columns, 'XRP')

In [None]:
# Price-level series drawn together on one chart per ticker.
columns = [
    'weightedAvgPrice', 'lastPrice', 'askPrice', 'bidPrice',
    '50_ema_lastPrice', '200_ema_lastPrice', '1000_ema_lastPrice',
]

# Gaps to the last price, plus the bid/ask spread and its 500-row EMA.
columns_diff = [
    'weightedAvgPrice_diff', 'askPrice_diff', 'bidPrice_diff',
    '50_ema_lastPrice_diff', '200_ema_lastPrice_diff', '1000_ema_lastPrice_diff',
    'spread', '500_ema_spread',
]

# XRP first, then BTC; levels before differences for each ticker.
for plot_ticker in ('XRP', 'BTC'):
    plot_data(data, columns, plot_ticker)
    plot_data(data, columns_diff, plot_ticker)