In [1]:
import boto3
import pandas as pd
import numpy as np
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore")

### independent variables

In [2]:
# Read the 1-minute base bars from S3 and keep only the rows whose symbol is 'SPY'.
# spy_1m_df_loc = f'{s3_prefix}{bucket}/{primary_folder}/data/data_prep/stock_bars_1min.parquet'
spy_1m_df_loc = 's3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_1min_base.parquet'
spy_1m_df = pd.read_parquet(spy_1m_df_loc).loc[lambda bars: bars['symbol'] == 'SPY']

In [3]:
# Preview the first rows of the SPY-only frame (price bars plus the precomputed EMA/SMA columns).
spy_1m_df.head()

Unnamed: 0_level_0,open,high,low,close,volume,symbol,time_delta,open_ema_2m,open_sma_2m,high_ema_2m,...,open_ema_26m,open_sma_26m,high_ema_26m,high_sma_26m,low_ema_26m,low_sma_26m,close_ema_26m,close_sma_26m,volume_ema_26m,volume_sma_26m
us_eastern_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-01-02 04:00:00,476.25,476.36,476.0,476.31,20460.0,SPY,,476.25,,476.36,...,476.25,,476.36,,476.0,,476.31,,20460.0,
2024-01-02 04:01:00,476.34,476.34,476.29,476.29,6369.0,SPY,,476.31,476.295,476.346667,...,476.256667,,476.358519,,476.021481,,476.308519,,19416.222222,
2024-01-02 04:02:00,476.29,476.29,476.28,476.28,6152.0,SPY,,476.296667,476.315,476.308889,...,476.259136,,476.353443,,476.040631,,476.306406,,18433.687243,
2024-01-02 04:03:00,476.27,476.27,476.27,476.27,369.0,SPY,,476.278889,476.28,476.282963,...,476.259941,,476.347262,,476.057621,,476.303709,,17095.562262,
2024-01-02 04:04:00,476.27,476.27,476.27,476.27,369.0,SPY,,476.272963,476.27,476.274321,...,476.260686,,476.341539,,476.073353,,476.301212,,15856.55765,


In [4]:
# (rows, columns) of the SPY-only frame before any features are added in this notebook.
spy_1m_df.shape

(214794, 87)

In [5]:
# The shift/rolling features built below depend on row order, so make sure
# the datetime index is ascending and sort it in place when it is not.
if not spy_1m_df.index.is_monotonic_increasing:
    print("The index is not sorted. Sorting now.")
    spy_1m_df.sort_index(inplace=True)
else:
    print("The index is sorted.")

The index is sorted.


In [6]:
# Turn the us_eastern_timestamp index into a regular column (later cells read it via
# spy_1m_df.us_eastern_timestamp.dt) and switch to a 0..n-1 positional index.
spy_1m_df = spy_1m_df.reset_index()

#### lags

In [7]:
# Add <price>_lag_<k> columns holding each OHLC value from k rows earlier, for k = 1..15.
# The shift is purely positional, so it does not stop at day boundaries.
ohlc_fields = ('open', 'high', 'low', 'close')
for lag in range(1, 16):
    for field in ohlc_fields:
        lagged_values = spy_1m_df[field].shift(lag)
        spy_1m_df[f'{field}_lag_{lag}'] = lagged_values

In [8]:
# Column count should have grown by 61: the timestamp column from reset_index
# plus 60 lag columns (15 lags x 4 price fields).
spy_1m_df.shape

(214794, 148)

#### direction changes

In [9]:
def calculate_direction_changes(df, price_column='close'):
    """Count, per symbol and calendar day, how often the price direction changes.

    A row counts as a direction change when its direction (up / down) differs
    from the direction of the row immediately before it and the row itself is
    not flat. Flat rows (zero or undefined price change, e.g. the first row)
    never count, but a move that follows a flat row does.

    Price changes are taken in row order over the whole frame; they are not
    computed separately per symbol or per day.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a DatetimeIndex (required by the daily ``pd.Grouper``) and a
        ``symbol`` column, plus the column named by ``price_column``.
        The frame is not modified.
    price_column : str, default 'close'
        Column whose row-to-row difference defines the direction.

    Returns
    -------
    pandas.DataFrame
        One row per (symbol, day) with the columns ``symbol``, the index
        level of ``df`` (reset into a column) and ``daily_direction_changes``.

    Raises
    ------
    KeyError
        If ``symbol`` or ``price_column`` is missing from ``df``.
    TypeError
        If the index of ``df`` is not datetime-like.
    """
    price_change = df[price_column].diff()

    # Encode direction numerically (+1 up, -1 down, 0 flat/NaN). The previous
    # implementation called .diff() on 'up'/'down' strings, which raises
    # TypeError because strings cannot be subtracted.
    direction = price_change.gt(0).astype(int) - price_change.lt(0).astype(int)
    direction_change = direction.ne(direction.shift()) & direction.ne(0)

    # Work on a small private frame so the caller's DataFrame gains no columns.
    flagged = df[['symbol']].copy()
    flagged['direction_change'] = direction_change

    direction_changes = (
        flagged.groupby(['symbol', pd.Grouper(freq='D')])['direction_change']
        .sum()
        .reset_index()
    )
    return direction_changes.rename(columns={'direction_change': 'daily_direction_changes'})

In [10]:
# Snapshot of the frame taken before the direction features are added.
# Uncomment the second line to restore spy_1m_df from that snapshot.
spy_1m_df_backup = spy_1m_df.copy()
# spy_1m_df = spy_1m_df_backup.copy()

In [11]:
# Per-bar direction features derived from the close price.
#
#   price_change, direction      close-to-close move versus the previous row and its label
#                                ('up' / 'down' / 'no change').
#   cumulative_ups / _downs      running count of up / down bars within each calendar date.
#   direction_change_*           True when the bar's direction is the opposite of the
#                                immediately preceding bar's direction.
#   prev_cumulative_ups / _downs running count carried by the last row before the date
#                                changed, repeated for every row of the new date.
#
# Helper values (date, previous direction, ...) are kept as local Series instead of being
# added to the frame and dropped afterwards, and the final drop tolerates already-missing
# columns, so this cell can be re-run without raising KeyError.
trade_date = spy_1m_df['us_eastern_timestamp'].dt.date

spy_1m_df['price_change'] = spy_1m_df['close'].diff()
# A NaN change (first row) is neither > 0 nor < 0, so it falls through to 'no change'.
spy_1m_df['direction'] = np.select(
    [spy_1m_df['price_change'] > 0, spy_1m_df['price_change'] < 0],
    ['up', 'down'],
    default='no change',
)

is_up = spy_1m_df['direction'] == 'up'
is_down = spy_1m_df['direction'] == 'down'
previous_direction = spy_1m_df['direction'].shift()

spy_1m_df['cumulative_ups'] = is_up.groupby(trade_date).cumsum()
spy_1m_df['cumulative_downs'] = is_down.groupby(trade_date).cumsum()
spy_1m_df['direction_change_up_to_down'] = is_down & (previous_direction == 'up')
spy_1m_df['direction_change_down_to_up'] = is_up & (previous_direction == 'down')

# Keep the previous row's running count only where the date differs from the previous
# row's date (i.e. on the first row of each date), then carry it forward through the day.
first_row_of_date = trade_date != trade_date.shift()
spy_1m_df['prev_cumulative_ups'] = (
    spy_1m_df['cumulative_ups'].shift().where(first_row_of_date).ffill()
)
spy_1m_df['prev_cumulative_downs'] = (
    spy_1m_df['cumulative_downs'].shift().where(first_row_of_date).ffill()
)

# errors='ignore' keeps the cell re-runnable once these columns are already gone.
spy_1m_df.drop(columns=['symbol', 'time_delta'], inplace=True, errors='ignore')
spy_1m_df.tail(13)

Unnamed: 0,us_eastern_timestamp,open,high,low,close,volume,open_ema_2m,open_sma_2m,high_ema_2m,high_sma_2m,...,low_lag_15,close_lag_15,price_change,direction,cumulative_ups,cumulative_downs,direction_change_up_to_down,direction_change_down_to_up,prev_cumulative_ups,prev_cumulative_downs
214781,2024-11-19 19:47:00,591.63,591.63,591.63,591.63,250.0,591.637,591.645,591.638002,591.645,...,590.78,590.87,-0.02,down,382,359,True,False,391.0,363.0
214782,2024-11-19 19:48:00,591.67,591.67,591.67,591.67,159.0,591.659,591.65,591.659334,591.65,...,590.78,590.87,0.04,up,383,359,False,True,391.0,363.0
214783,2024-11-19 19:49:00,591.67,591.67,591.67,591.67,159.0,591.666333,591.67,591.666445,591.67,...,590.78,590.87,0.0,no change,383,359,False,False,391.0,363.0
214784,2024-11-19 19:50:00,591.68,591.68,591.63,591.63,1473.0,591.675444,591.675,591.675482,591.675,...,590.78,590.87,-0.04,down,383,360,False,False,391.0,363.0
214785,2024-11-19 19:51:00,591.6,591.6099,591.6,591.6099,963.0,591.625148,591.64,591.631761,591.64495,...,590.78,590.87,-0.0201,down,383,361,False,False,391.0,363.0
214786,2024-11-19 19:52:00,591.58,591.58,591.58,591.58,597.0,591.595049,591.59,591.597254,591.59495,...,590.78,590.87,-0.0299,down,383,362,False,False,391.0,363.0
214787,2024-11-19 19:53:00,591.66,591.66,591.55,591.55,1559.0,591.63835,591.62,591.639085,591.62,...,590.78,590.87,-0.03,down,383,363,False,False,391.0,363.0
214788,2024-11-19 19:54:00,591.53,591.53,591.5,591.52,1052.0,591.566117,591.595,591.566362,591.595,...,590.78,590.87,-0.03,down,383,364,False,False,391.0,363.0
214789,2024-11-19 19:55:00,591.53,591.53,591.5,591.52,1052.0,591.542039,591.53,591.542121,591.53,...,591.56,591.56,0.0,no change,383,364,False,False,391.0,363.0
214790,2024-11-19 19:56:00,591.52,591.54,591.48,591.48,3823.0,591.527346,591.525,591.540707,591.535,...,591.56,591.56,-0.04,down,383,365,False,False,391.0,363.0


In [12]:
# Calendar/time-of-day features taken straight from the bar timestamp.
for time_part in ('hour', 'minute', 'day_of_year'):
    spy_1m_df[time_part] = getattr(spy_1m_df['us_eastern_timestamp'].dt, time_part)

In [13]:
def _rows_since_last_true(flags):
    """Return, for each row, the number of rows since the most recent True in ``flags``.

    Rows where ``flags`` is True get 0. Rows before the first True count up
    from 0 starting at the first row.
    """
    segment_id = (flags == 1).cumsum()
    return flags.groupby(segment_id).cumcount() * (flags == 0)


# Rolling local-extrema features on the close price.
# closed='left' leaves the current bar out of the window, so each close is compared with
# the max/min of the preceding `window_size` bars only. Until a full window is available
# the rolling value is NaN and the comparison evaluates to False.
fields = []
for window_size in [5, 10, 15, 30]:
    max_flag = f'local_max_{window_size}'
    min_flag = f'local_min_{window_size}'
    max_count = f'cumulative_local_max_{window_size}'
    min_count = f'cumulative_local_min_{window_size}'
    since_max = f'time_since_prev_max_{window_size}'
    since_min = f'time_since_prev_min_{window_size}'

    preceding_closes = spy_1m_df['close'].rolling(window=window_size, closed='left')
    spy_1m_df[max_flag] = spy_1m_df['close'] >= preceding_closes.max()
    spy_1m_df[min_flag] = spy_1m_df['close'] <= preceding_closes.min()

    # Running totals of flagged bars over the whole frame (not reset per day).
    spy_1m_df[max_count] = spy_1m_df[max_flag].cumsum()
    spy_1m_df[min_count] = spy_1m_df[min_flag].cumsum()

    # Rows elapsed since the last flagged bar.
    spy_1m_df[since_max] = _rows_since_last_true(spy_1m_df[max_flag])
    spy_1m_df[since_min] = _rows_since_last_true(spy_1m_df[min_flag])

    fields.extend([max_flag, min_flag, max_count, min_count, since_max, since_min])

In [14]:
# Show the last rows of close next to every local-extrema column collected in `fields`.
spy_1m_df[['close'] + fields].tail()

Unnamed: 0,close,local_max_5,local_min_5,cumulative_local_max_5,cumulative_local_min_5,time_since_prev_max_5,time_since_prev_min_5,local_max_10,local_min_10,cumulative_local_max_10,...,cumulative_local_max_15,cumulative_local_min_15,time_since_prev_max_15,time_since_prev_min_15,local_max_30,local_min_30,cumulative_local_max_30,cumulative_local_min_30,time_since_prev_max_30,time_since_prev_min_30
214789,591.52,False,True,68985,64722,6,0,False,True,48468,...,39306,34643,6,0,False,False,27193,23140,6,16
214790,591.48,False,True,68985,64723,7,0,False,True,48468,...,39306,34644,7,0,False,False,27193,23140,7,17
214791,591.42,False,True,68985,64724,8,0,False,True,48468,...,39306,34645,8,0,False,False,27193,23140,8,18
214792,591.39,False,True,68985,64725,9,0,False,True,48468,...,39306,34646,9,0,False,False,27193,23140,9,19
214793,591.39,False,True,68985,64726,10,0,False,True,48468,...,39306,34647,10,0,False,False,27193,23140,10,20


In [15]:
# TODO (not implemented yet): today's high till now
# TODO (not implemented yet): today's low till now

In [16]:
# Persist the enriched frame to S3. Note that spy_1m_df_loc is re-pointed from the input
# parquet path to the output path here.
spy_1m_df_loc = 's3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_1min_base_other_features.parquet'
spy_1m_df.to_parquet(spy_1m_df_loc)

In [17]:
# spy_1m_df.to_parquet('s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_1min_base_other_features.parquet')