In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `functions` package imported below) are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import itertools
import mplfinance as mpf
import re
import math
from math import floor
import sys
import os

from functions import *
from functions.preprocessing import *
from functions.swing_points import *
from functions.fvg import *

# Read the Quandl API key from the environment rather than committing it to the
# notebook. The value is None when QUANDL_API_KEY is not set.
# NOTE(review): the key that used to be hardcoded here has been exposed in the
# notebook history and should be revoked/rotated.
quandl_api_key = os.environ.get('QUANDL_API_KEY')

In [3]:
# Toggle between the small tab-separated test fixtures and the full CSV exports.
test = False
if test:
    df_es = pd.read_csv('EP_test_set.csv', sep='\t')
    df_nq = pd.read_csv('ENQ_test_set.csv', sep='\t')
else:
    # Directory holding the full CSV exports. Override it with the
    # FUTURES_DATA_DIR environment variable so the notebook is not tied to one
    # machine; the default is the location that was previously hardcoded.
    data_dir = os.environ.get('FUTURES_DATA_DIR', '/Users/kush/Desktop/futures_data')
    df_es = pd.read_csv(os.path.join(data_dir, 'EP.csv'), low_memory=True)
    df_nq = pd.read_csv(os.path.join(data_dir, 'ENQ.csv'), low_memory=True)

    # Keep only the most recent `rows` rows of each frame. The current value is
    # large enough to act as "no limit"; an earlier variant used
    # max(len(df_es), len(df_nq)) // 2.
    rows = 100_000_000
    df_es = df_es[-rows:]
    df_nq = df_nq[-rows:]

# Both branches apply the same preprocessing, so it is done once here.
df_es = preprocess_dataframe(df_es, log_returns=False)
df_nq = preprocess_dataframe(df_nq, log_returns=False)


In [30]:
# Settings shared by both instruments; they do not change between iterations,
# so they are defined once ahead of the loop.
lookback = 20
timeframes = ['1T', '5T', '15T', '1H', '4H', '1B']
timeframes_fvg = ['1T', '5T', '15T', '1H', '4H', '1B']

# Run swing-point detection followed by fair-value-gap detection on each
# instrument's frame and collect the results in order (ES first, then NQ).
df_list = []
for df in (df_es, df_nq):
    df = identify_fair_value_gaps_optimized(
        identify_swing_points_optimized(
            df,
            timeframes,
            get_swing_values=False,
            interpolation_method='None',
        ),
        timeframes_fvg,
        get_high_low=False,
    )
    df_list.append(df)

In [31]:
# Inner-join the two per-instrument frames on 'datetime'; overlapping column
# names get the '_es' / '_nq' suffixes (first frame is ES, second is NQ).
df = df_list[0].merge(
    df_list[1],
    how='inner',
    on='datetime',
    suffixes=('_es', '_nq'),
)

In [32]:
# Derive calendar fields from the datetime index. The first four are read
# straight off the index by attribute name; day_of_week is stored as int8.
df = df.assign(
    **{field: getattr(df.index, field) for field in ('month', 'day', 'hour', 'minute')},
    day_of_week=df.index.dayofweek.astype('int8'),
)

def add_cyclic_date_features(df, column, max_val):
    """Replace `column` with its sine/cosine encoding, mutating `df` in place.

    Adds `<column>_sin` and `<column>_cos` (both float32), computed from the
    angle 2*pi*value/max_val, then drops the source column.

    Args:
        df: DataFrame to modify in place.
        column: Name of the numeric column to encode.
        max_val: Divisor that defines one full cycle for the column's values.

    Returns:
        None. The result is visible through `df` itself.
    """
    angle = 2 * np.pi * df[column] / max_val
    encoded = {
        f'{column}_sin': np.sin(angle),
        f'{column}_cos': np.cos(angle),
    }
    for name, values in encoded.items():
        df[name] = values.astype('float32')
    df.drop(columns=[column], inplace=True)

# Encode each calendar field on the unit circle. The period must equal the
# number of distinct values in one cycle, otherwise the last value lands on the
# same point as the first: hours run 0-23 (period 24) and weekdays run 0-6
# (period 7). The previous periods of 23 and 6 made hour 23 identical to hour 0
# and Sunday identical to Monday.
add_cyclic_date_features(df, 'month', 12)
add_cyclic_date_features(df, 'day', 31)
add_cyclic_date_features(df, 'hour', 24)
add_cyclic_date_features(df, 'minute', 60)
add_cyclic_date_features(df, 'day_of_week', 7)


In [33]:
# Reference prices taken from the ES leg and forward-filled onto every bar:
# the current week's open, the previous week's close, and the day's first
# (midnight) open.
# NOTE(review): with label='left' each bin is labelled by its left edge; confirm
# that bars near a bin boundary (e.g. the Sunday session open) pick up the
# intended week and do not read a value from later bars.
df['weekly_open'] = df.open_es.resample('W', label='left').first().reindex(df.index, method='ffill')
# A week's close is its LAST close; shift(1) then carries it into the following
# week. (Using first() here returned the first close of the previous week.)
df['weekly_close'] = df.close_es.resample('W', label='left').last().shift(1).reindex(df.index, method='ffill')
df['daily_open'] = df.open_es.resample('D', label='left').first().reindex(df.index, method='ffill')

In [34]:
# Group columns by role for the later transforms.
# Price-like columns: any column holding at least one value greater than 1.
price_cols = df.columns[(df > 1).any(axis=0)]
# Cyclical encodings: names containing '_sin' or '_cos'.
cyclical_cols = [name for name in df.columns if re.match(r'^.*(_sin|_cos)', name) is not None]
# Swing-point and fair-value-gap columns, selected by substring.
swing_cols = list(filter(lambda name: 'swing' in name, df.columns))
fvg_cols = list(filter(lambda name: 'fair' in name, df.columns))


In [9]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import
# cell at the top so all dependencies are visible in one place.
from tsfracdiff import FractionalDifferentiator
# Differentiator instance used for the transform below.
s = FractionalDifferentiator()

# Overwrite the price-like columns in place with the output of FitTransform
# (presumably the fractionally differenced series - confirm against the
# tsfracdiff documentation). Re-running this cell applies the transform again
# on already-transformed data, so run it only once per fresh `df`.
df.loc[:, price_cols] = s.FitTransform(df[price_cols])

In [10]:
# Convert to compact dtypes: fill missing values with 0, let pandas infer the
# best dtype per column, then force swing flags to bool and downcast numerics.
df = df.fillna(0).convert_dtypes()
df[swing_cols] = df[swing_cols].astype('bool')
df = df.astype({col: 'float32' for col in df.select_dtypes(include='float64').columns})

# convert_dtypes may turn whole-number float columns into 64-bit integers, and a
# blanket cast to int8 would silently wrap any value outside [-128, 127]. Only
# columns whose values fit are downcast to int8; the rest stay 64-bit integers.
int_cols = df.select_dtypes(include='int64').columns
int8_limits = np.iinfo(np.int8)
fits_int8 = {
    col for col in int_cols
    if df[col].between(int8_limits.min, int8_limits.max).all()
}
df = df.astype({col: ('int8' if col in fits_int8 else 'int64') for col in int_cols})

In [99]:
# Persist the final feature frame, including its index, to a Parquet file in the
# current working directory (presumably consumed by the LSTM training code - confirm).
df.to_parquet('df_LSTM.parquet', index=True)