In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np
import itertools
import mplfinance as mpf
import re
import math
from math import floor
import sys
import os

import dask

from functions import *
from functions.preprocessing import *
from functions.swing_points import *
from functions.fvg import *

# Read the Quandl API key from the environment instead of embedding it in the notebook.
# Set QUANDL_API_KEY before starting the kernel; the value is None when it is unset.
# NOTE(review): a literal key used to sit on this line in plain text - treat it as
# leaked and rotate it.
quandl_api_key = os.environ.get('QUANDL_API_KEY')

In [27]:
def _load_minute_bars(csv_path):
    """Read a headerless, semicolon-separated 1-minute bar file into an OHLC frame.

    The file is expected to carry the columns date, time, open, high, low,
    close, vol (in that order). The returned frame:
      * is indexed by a tz-aware 'datetime' index: the date/time strings are
        parsed as day/month/year, localized to America/Chicago and converted
        to America/New_York;
      * keeps only open/high/low/close, stored as float32;
      * keeps the last row for any repeated timestamp.
    """
    bars = pd.read_csv(
        csv_path,
        sep=';',
        names=['date', 'time', 'open', 'high', 'low', 'close', 'vol'],
    )
    bars['datetime'] = pd.to_datetime(bars['date'] + ' ' + bars['time'], format='%d/%m/%Y %H:%M')
    bars = (
        bars
        .infer_objects()
        .set_index('datetime')
        .drop(columns=['date', 'time', 'vol'])
    )
    bars.index = bars.index.tz_localize('America/Chicago').tz_convert('America/New_York')
    repeated_stamp = bars.index.duplicated(keep='last')
    return bars.loc[~repeated_stamp].astype('float32')


df_es = _load_minute_bars('es-1m_bk.csv')
df_nq = _load_minute_bars('nq-1m_bk.csv')

In [4]:
# Choose between the small tab-separated test extracts and the full futures files,
# then run both frames through the shared preprocessing helper.
# NOTE: when the notebook is run top to bottom this cell rebinds df_es / df_nq,
# replacing the frames produced by the semicolon-delimited loader in the cell above.
test = False

if not test:
    # NOTE(review): absolute, machine-specific paths - move to a configurable data directory.
    df_es = pd.read_csv('/Users/kush/Desktop/futures_data/EP.csv', low_memory=True)
    df_nq = pd.read_csv('/Users/kush/Desktop/futures_data/ENQ.csv', low_memory=True)

    # Keep only the most recent `rows` records of each frame.
    # (A previously tried alternative was max(len(df_es), len(df_nq)) // 2.)
    rows = 100000000
    df_es = df_es[-rows:]
    df_nq = df_nq[-rows:]
else:
    df_es = pd.read_csv('EP_test_set.csv', sep='\t')
    df_nq = pd.read_csv('ENQ_test_set.csv', sep='\t')

df_es = preprocess_dataframe(df_es, log_returns=False)
df_nq = preprocess_dataframe(df_nq, log_returns=False)


In [28]:
# Build the swing-point and fair-value-gap (FVG) feature frame for ES and NQ.
# Results are collected in df_list in the order [ES, NQ].
df_list = []

# log_returns is also read by the NWOG cell further down, so it stays a module-level name.
log_returns = False
lookback = 20

# Intermediate per-timeframe columns that are dropped once the lookback features exist.
swing_raw_pattern = re.compile(r'^swing_high_\d{1,4}[TDW]$|^swing_low_\d{1,4}[TDW]$')
fvg_raw_pattern = re.compile(
    r'^fair_value_gap_\d{1,4}[TDW]$|^fair_value_gap_\d{1,4}[TDW]_high$|^fair_value_gap_\d{1,4}[TDW]_low$'
)
# Numbered FVG lookback columns, e.g. fair_value_gap_5T_3.
fvg_numbered_pattern = re.compile(r'^fair_value_gap_\d{1,4}[TDW]_\d+$')


def _zero_fill_and_downcast(frame):
    """Replace NaNs with 0 in place and store OHLC-named columns as float32.

    A column is selected when its name contains 'open', 'high', 'low' or
    'close' anywhere, so derived columns such as swing_high_* are included.
    Returns the same frame object.
    """
    frame.fillna(0, inplace=True)
    print('Converting dtypes')
    ohlc_like = [
        name for name in frame.columns
        if any(token in name for token in ['open', 'high', 'low', 'close'])
    ]
    frame[ohlc_like] = frame[ohlc_like].astype('float32')
    return frame


for df in (df_es, df_nq):
    # Step 1: find swing points on every timeframe.
    # NOTE(review): interpolation_method is the string 'None', not the None object -
    # presumably what the helper expects; verify against functions.swing_points.
    timeframes = ['1T', '5T', '15T', '60T', '240T', '1440T']
    print('Finding swing points')
    df = identify_swing_points_optimized(df, timeframes, interpolation_method='None')

    # Step 2: attach the past `lookback` swing points to every time point.
    print(f'Getting past {lookback} swings into list')
    df = swings_to_features_optimized(df, lookback, log_returns)

    # Step 3: drop the intermediate swing columns created by steps 1 and 2.
    print('Dropping unnecesary swings columns')
    df = df.drop(columns=[name for name in df.columns if swing_raw_pattern.match(name)])

    # Step 4: shrink memory use before the FVG stage.
    df = _zero_fill_and_downcast(df)

    # Step 5: find fair value gaps on every timeframe.
    print('Getting Fair Value Gaps')
    timeframes_fvg = ['1T', '5T', '15T', '60T', '240T', '1440T']
    df = identify_fair_value_gaps_optimized(df, timeframes_fvg)

    # Step 6: attach the past `lookback` FVGs to every time point.
    print(f'Getting past {lookback} FVGs')
    df = fvg_to_feature_optimized(df, lookback, log_returns)

    # Step 7: drop the intermediate FVG columns created by steps 5 and 6.
    print('Dropping unnecesary FVG columns')
    df = df.drop(columns=[name for name in df.columns if fvg_raw_pattern.match(name)])

    # Step 8: shrink memory use again now that the FVG features exist.
    df = _zero_fill_and_downcast(df)

    # NOTE(review): if the numbered FVG columns hold prices, the uint16 cast truncates
    # the fractional part - confirm what fvg_to_feature_optimized emits.
    fvg_numbered = [name for name in df.columns if fvg_numbered_pattern.match(name)]
    df[fvg_numbered] = df[fvg_numbered].astype('uint16')

    print('Adding DF to list \n')
    df_list.append(df)

Finding swing points
Getting past 20 swings into list
Dropping unnecesary swings columns
Converting dtypes
Getting Fair Value Gaps
Getting past 20 FVGs
Dropping unnecesary FVG columns
Converting dtypes
Adding DF to list 

Finding swing points
Getting past 20 swings into list
Dropping unnecesary swings columns
Converting dtypes
Getting Fair Value Gaps
Getting past 20 FVGs
Dropping unnecesary FVG columns
Converting dtypes
Adding DF to list 



# DATAFRAME IS READY; THE NEXT STEP IS THE MACHINE LEARNING
# TRY IT LOCALLY FIRST, THEN MOVE TO SAGEMAKER (DEPENDING ON HOW THE LOCAL RUN GOES)

In [36]:
# Show which columns of the ES feature frame ended up stored as uint16.
uint16_columns = df_list[0].select_dtypes(include='uint16').columns
uint16_columns.tolist()

['fair_value_gap_1T_1',
 'fair_value_gap_1T_2',
 'fair_value_gap_1T_3',
 'fair_value_gap_1T_4',
 'fair_value_gap_1T_5',
 'fair_value_gap_1T_6',
 'fair_value_gap_1T_7',
 'fair_value_gap_1T_8',
 'fair_value_gap_1T_9',
 'fair_value_gap_1T_10',
 'fair_value_gap_1T_11',
 'fair_value_gap_1T_12',
 'fair_value_gap_1T_13',
 'fair_value_gap_1T_14',
 'fair_value_gap_1T_15',
 'fair_value_gap_1T_16',
 'fair_value_gap_1T_17',
 'fair_value_gap_1T_18',
 'fair_value_gap_1T_19',
 'fair_value_gap_1T_20',
 'fair_value_gap_5T_1',
 'fair_value_gap_5T_2',
 'fair_value_gap_5T_3',
 'fair_value_gap_5T_4',
 'fair_value_gap_5T_5',
 'fair_value_gap_5T_6',
 'fair_value_gap_5T_7',
 'fair_value_gap_5T_8',
 'fair_value_gap_5T_9',
 'fair_value_gap_5T_10',
 'fair_value_gap_5T_11',
 'fair_value_gap_5T_12',
 'fair_value_gap_5T_13',
 'fair_value_gap_5T_14',
 'fair_value_gap_5T_15',
 'fair_value_gap_5T_16',
 'fair_value_gap_5T_17',
 'fair_value_gap_5T_18',
 'fair_value_gap_5T_19',
 'fair_value_gap_5T_20',
 'fair_value_gap_1

In [12]:
# Combine the ES and NQ feature frames on their shared 'datetime' key; overlapping
# column names are disambiguated with the _es / _nq suffixes.
es_features, nq_features = df_list[0], df_list[1]
df = pd.merge(es_features, nq_features, how='inner', on='datetime', suffixes=('_es', '_nq'))
# (df_es / df_nq are intentionally not deleted here.)

# Extract year, month, day, date, hour, minute and day of week as separate features.
# day_of_week is consumed by the one-hot encoding below, so an untouched copy is kept
# as day_of_the_week for the weekday filters in later cells.
stamps = df.index
df = df.assign(
    year=stamps.year,
    month=stamps.month,
    day=stamps.day,
    date=stamps.date,
    hour=stamps.hour,
    minute=stamps.minute,
    day_of_week=stamps.dayofweek,
    day_of_the_week=stamps.dayofweek,
)
df = pd.get_dummies(df, prefix='day_of_week', columns=['day_of_week'])

# The frame may be fragmented at this point; a deep copy (df.copy(deep=True)) would
# de-fragment it, but that step is currently disabled.

KeyboardInterrupt: 

In [None]:
##### CALCULATE NWOGs (Friday close / next open pairs) and add the past 5 to each time point #####
#
# Inputs taken from earlier cells: `df` (merged frame with a DatetimeIndex and the
# *_es OHLC columns) and `log_returns` (set in the feature-pipeline cell).
# Output: `df` gains NWOG_close_1..5 and NWOG_open_1..5.

# Resample to daily frequency and backfill the OHLC into days without data (weekends).
daily_df = df.resample('D').agg({'open_es': 'first', 'high_es': 'max', 'low_es': 'min', 'close_es': 'last'}).bfill()

# Closing prices on Fridays (dayofweek == 4).
friday_closes = daily_df[daily_df.index.dayofweek == 4]['close_es']

# Opening price of the next session, aligned to the Friday row.
# Because of the backfill, Saturday's row carries the next available open; shifting
# by -1 moves it onto Friday, so the Friday close and the following open share an index.
sunday_opening = daily_df['open_es'].shift(-1)
sunday_opening = sunday_opening[sunday_opening.index.dayofweek == 4]

# Build the weekly frame, either as week-over-week log returns or as raw prices.
if log_returns:
    nwog_df = pd.DataFrame({'NWOG_close': np.log(friday_closes / friday_closes.shift(1)), 'NWOG_open': np.log(sunday_opening / sunday_opening.shift(1))})
else:
    nwog_df = pd.DataFrame({'NWOG_close': friday_closes, 'NWOG_open': sunday_opening})

# Lagged copies: column _i holds the value from i Fridays earlier.
for i in range(1, 6):
    nwog_df[f'NWOG_close_{i}'] = nwog_df['NWOG_close'].shift(i)
    nwog_df[f'NWOG_open_{i}'] = nwog_df['NWOG_open'].shift(i)

# Only the lagged columns are kept; the unshifted pair is dropped.
nwog_df.drop(columns=['NWOG_close', 'NWOG_open'], inplace=True)

# Attach the weekly values to the intraday frame and carry them forward in time.
# .ffill() replaces fillna(method='ffill'), whose `method` argument is deprecated
# in current pandas; the result is the same.
# NOTE: the forward fill applies to every column of df, not only the NWOG ones.
df = df.join(nwog_df, how='left').ffill()

In [None]:
# GET DAILY HIGH AND LOW BEFORE LUNCH
#
# Per-date ES extremes over two intraday windows, plus the midnight open, are merged
# back onto every row; then one row per day (the 09:29 bar) is kept as df_final.

is_weekday = df.day_of_the_week < 5
es_extremes = {'high_es': 'max', 'low_es': 'min'}

# Morning window (09:30-11:59): high and low per date.
filtered_df = df[is_weekday].between_time('09:30', '11:59')
morning_high_low = filtered_df.groupby('date').agg(es_extremes)
morning_high_low.columns = ['Morning_High', 'Morning_Low']

# Full-session window (09:30-15:59): high and low per date.
filtered_df = df[is_weekday].between_time('09:30', '15:59')
daily_high_low = filtered_df.groupby('date').agg(es_extremes)
daily_high_low.columns = ['Daily_High', 'Daily_Low']
morning_high_low = morning_high_low.join(daily_high_low)

# Daily midnight open: the ES open of the 00:00 bar, keyed by date.
at_midnight = (df.hour == 0) & (df.minute == 0)
midnight_open = df.loc[at_midnight, ['open_es', 'date']].set_index('date')
morning_high_low = morning_high_low.join(midnight_open).rename(columns={'open_es': 'Midnight_Open'})

# NOTE(review): after this merge on the 'date' column the frame appears to carry a plain
# integer index (reset_index() below yields the 'index' column that is dropped at the
# end), so re-running this cell without rebuilding df would break between_time - confirm.
df = df.merge(morning_high_low, how='left', on='date')

# GET ONLY 0929 TIMEPOINTS FOR EACH DAY
at_0929 = (df.hour == 9) & (df.minute == 29)
df_final = df[at_0929].reset_index().dropna()

# Daily_High / Daily_Low, Morning_High / Morning_Low and Midnight_Open stay in price
# units; their conversion to log returns against the 09:29 close is disabled.

# Convert the swing / FVG / NWOG price columns to log returns against the 09:29 ES close.
# NOTE(review): earlier cells fill missing values with 0, and log(0 / close) is -inf -
# check whether such entries reach the saved file.
price_cols = [
    name for name in df_final.columns
    if any(token in name for token in ['swing', 'fair', 'NWOG'])
]
df_final[price_cols] = df_final[price_cols].div(df_final.close_es, axis=0).apply(np.log)

# The OHLC columns are not converted to log returns versus the previous day's
# 09:29 candle (that step is disabled).

# Cyclical encoding of month (period 12) and day of month (period 31).
for unit, period in (('month', 12), ('day', 31)):
    angle = 2 * np.pi * df_final[unit] / period
    df_final[f'{unit}_sin'] = np.sin(angle)
    df_final[f'{unit}_cos'] = np.cos(angle)

# Drop helper columns that are not model features.
df_final = df_final.drop(columns=['day_of_week_6', 'day_of_the_week', 'date', 'hour', 'minute', 'index'])

In [24]:
# Persist the one-row-per-day feature table next to the notebook, without the index column.
df_final.to_csv(path_or_buf='df_final.csv', index=False)

In [None]:
# Print a summary of the final frame (columns, dtypes, non-null counts, memory usage).
df_final.info()