In [1]:
import boto3
import pandas as pd
import numpy as np
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore")

### data

In [2]:
client = boto3.client('s3')

In [3]:
s3_prefix = 's3://'
bucket = 'sisyphus-general-bucket'
primary_folder = 'AthenaInsights'

In [4]:
dependent_variable_loc = 's3://sisyphus-general-bucket/AthenaInsights/latest_data/dependent_variable/stock_bars_1min.parquet'
dependent_variable = pd.read_parquet(dependent_variable_loc)

In [5]:
dependent_variable.head()

Unnamed: 0_level_0,open,high,low,close,close_sma_5m,category,future_highs,future_lows,slopes
us_eastern_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2024-01-02 04:00:00,476.25,476.36,476.0,476.31,476.31,C,,,
2024-01-02 04:01:00,476.34,476.34,476.29,476.29,476.29,C,,,
2024-01-02 04:02:00,476.29,476.29,476.28,476.28,476.28,C,,,
2024-01-02 04:03:00,476.27,476.27,476.27,476.27,476.27,C,,,
2024-01-02 04:04:00,476.27,476.27,476.27,476.27,476.284,C,,,


In [6]:
dependent_variable = dependent_variable[['category']]

### independent variables

In [7]:
# spy_1m_df_loc = f'{s3_prefix}{bucket}/{primary_folder}/data/data_prep/stock_bars_1min.parquet'
spy_1m_df_loc = 's3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_1min_base.parquet'
spy_1m_df = pd.read_parquet(spy_1m_df_loc)
spy_1m_df = spy_1m_df[spy_1m_df.symbol=='SPY']

In [8]:
spy_1m_df.head()

Unnamed: 0_level_0,open,high,low,close,volume,symbol,time_delta,open_ema_2m,open_sma_2m,high_ema_2m,...,open_ema_26m,open_sma_26m,high_ema_26m,high_sma_26m,low_ema_26m,low_sma_26m,close_ema_26m,close_sma_26m,volume_ema_26m,volume_sma_26m
us_eastern_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-01-02 04:00:00,476.25,476.36,476.0,476.31,20460.0,SPY,,476.25,,476.36,...,476.25,,476.36,,476.0,,476.31,,20460.0,
2024-01-02 04:01:00,476.34,476.34,476.29,476.29,6369.0,SPY,,476.31,476.295,476.346667,...,476.256667,,476.358519,,476.021481,,476.308519,,19416.222222,
2024-01-02 04:02:00,476.29,476.29,476.28,476.28,6152.0,SPY,,476.296667,476.315,476.308889,...,476.259136,,476.353443,,476.040631,,476.306406,,18433.687243,
2024-01-02 04:03:00,476.27,476.27,476.27,476.27,369.0,SPY,,476.278889,476.28,476.282963,...,476.259941,,476.347262,,476.057621,,476.303709,,17095.562262,
2024-01-02 04:04:00,476.27,476.27,476.27,476.27,369.0,SPY,,476.272963,476.27,476.274321,...,476.260686,,476.341539,,476.073353,,476.301212,,15856.55765,


In [9]:
spy_1m_df.shape

(214794, 87)

In [10]:
# Check if the Datetime index is sorted
if spy_1m_df.index.is_monotonic_increasing:
    print("The index is sorted.")
else:
    print("The index is not sorted. Sorting now.")
    spy_1m_df.sort_index(inplace=True)

The index is sorted.


In [11]:
spy_1m_df = spy_1m_df.reset_index()

#### lags

In [12]:
for lags in range(1, 16):
    for price in ['open', 'high', 'low', 'close']:
        spy_1m_df[f'{price}_lag_{lags}'] = spy_1m_df[f'{price}'].shift(lags)

In [13]:
spy_1m_df.shape

(214794, 148)

#### direction changes

In [14]:
def calculate_direction_changes(df, price_column='close'):
    df['price_change'] = df[price_column].diff()
    df['direction'] = df['price_change'].apply(lambda x: 'up' if x > 0 else 'down' if x < 0 else 'no change')
    df['direction_change'] = df['direction'].diff().ne(0) & df['direction'].ne('no change')
    direction_changes = df.groupby(['symbol', pd.Grouper(freq='D')])['direction_change'].sum().reset_index()
    direction_changes.rename(columns={'direction_change': 'daily_direction_changes'}, inplace=True)
    return direction_changes

In [15]:
spy_1m_df_backup = spy_1m_df.copy()
# spy_1m_df = spy_1m_df_backup.copy()

In [19]:
# calculate_direction_changes(spy_1m_df[['close']])
spy_1m_df['date'] = spy_1m_df.us_eastern_timestamp.dt.date
spy_1m_df['price_change'] = spy_1m_df['close'].diff()
spy_1m_df['direction'] = spy_1m_df['price_change'].apply(lambda x: 'up' if x > 0 else 'down' if x < 0 else 'no change')
spy_1m_df['direction_prev'] = spy_1m_df['direction'].shift()
spy_1m_df['cumulative_ups'] = spy_1m_df['direction']=='up'
spy_1m_df['cumulative_ups'] = spy_1m_df.groupby('date').cumulative_ups.cumsum()
spy_1m_df['cumulative_downs'] = spy_1m_df['direction']=='down'
spy_1m_df['cumulative_downs'] = spy_1m_df.groupby('date').cumulative_downs.cumsum()
spy_1m_df['direction_change_up_to_down'] = (spy_1m_df.direction=='down')&(spy_1m_df.direction_prev=='up')
spy_1m_df['direction_change_down_to_up'] = (spy_1m_df.direction=='up')&(spy_1m_df.direction_prev=='down')
spy_1m_df['prev_date'] = spy_1m_df['date'].shift()
spy_1m_df['prev_cumulative_ups'] = spy_1m_df['cumulative_ups'].shift()
spy_1m_df['prev_cumulative_downs'] = spy_1m_df['cumulative_downs'].shift()
spy_1m_df['prev_cumulative_ups'] = np.where(spy_1m_df.prev_date==spy_1m_df.date, np.nan, spy_1m_df.prev_cumulative_ups)
spy_1m_df['prev_cumulative_downs'] = np.where(spy_1m_df.prev_date==spy_1m_df.date, np.nan, spy_1m_df.prev_cumulative_downs)
spy_1m_df['prev_cumulative_ups'] = spy_1m_df['prev_cumulative_ups'].ffill()
spy_1m_df['prev_cumulative_downs'] = spy_1m_df['prev_cumulative_downs'].ffill()
spy_1m_df[['date', 'prev_date', 'cumulative_ups', 'prev_cumulative_ups']]
spy_1m_df[['date', 'prev_date', 'cumulative_downs', 'prev_cumulative_downs']]
spy_1m_df.drop(columns=['direction_prev', 'date', 'prev_date', 'symbol', 'time_delta'], inplace=True)
spy_1m_df.tail(13)

Unnamed: 0,us_eastern_timestamp,open,high,low,close,volume,open_ema_2m,open_sma_2m,high_ema_2m,high_sma_2m,...,direction,cumulative_ups,cumulative_downs,direction_change_up_to_down,direction_change_down_to_up,prev_cumulative_ups,prev_cumulative_downs,hour,minute,day_of_year
214781,2024-11-19 19:47:00,591.63,591.63,591.63,591.63,250.0,591.637,591.645,591.638002,591.645,...,down,382,359,True,False,391.0,363.0,19,47,19
214782,2024-11-19 19:48:00,591.67,591.67,591.67,591.67,159.0,591.659,591.65,591.659334,591.65,...,up,383,359,False,True,391.0,363.0,19,48,19
214783,2024-11-19 19:49:00,591.67,591.67,591.67,591.67,159.0,591.666333,591.67,591.666445,591.67,...,no change,383,359,False,False,391.0,363.0,19,49,19
214784,2024-11-19 19:50:00,591.68,591.68,591.63,591.63,1473.0,591.675444,591.675,591.675482,591.675,...,down,383,360,False,False,391.0,363.0,19,50,19
214785,2024-11-19 19:51:00,591.6,591.6099,591.6,591.6099,963.0,591.625148,591.64,591.631761,591.64495,...,down,383,361,False,False,391.0,363.0,19,51,19
214786,2024-11-19 19:52:00,591.58,591.58,591.58,591.58,597.0,591.595049,591.59,591.597254,591.59495,...,down,383,362,False,False,391.0,363.0,19,52,19
214787,2024-11-19 19:53:00,591.66,591.66,591.55,591.55,1559.0,591.63835,591.62,591.639085,591.62,...,down,383,363,False,False,391.0,363.0,19,53,19
214788,2024-11-19 19:54:00,591.53,591.53,591.5,591.52,1052.0,591.566117,591.595,591.566362,591.595,...,down,383,364,False,False,391.0,363.0,19,54,19
214789,2024-11-19 19:55:00,591.53,591.53,591.5,591.52,1052.0,591.542039,591.53,591.542121,591.53,...,no change,383,364,False,False,391.0,363.0,19,55,19
214790,2024-11-19 19:56:00,591.52,591.54,591.48,591.48,3823.0,591.527346,591.525,591.540707,591.535,...,down,383,365,False,False,391.0,363.0,19,56,19


In [20]:
spy_1m_df['hour'] = spy_1m_df.us_eastern_timestamp.dt.hour
spy_1m_df['minute'] = spy_1m_df.us_eastern_timestamp.dt.minute
spy_1m_df['day_of_year'] = spy_1m_df.us_eastern_timestamp.dt.day

In [18]:
# spy_1m_df['yest_total_ups'] = spy_1m_df.groupby('date')['cumulative_ups'].max().transform(lambda x: x.shift().fillna(0))
# spy_1m_df['yest_total_downs'] = spy_1m_df.groupby('date')['cumulative_downs'].transform(lambda x: x.shift().fillna(0))
# spy_1m_df.merge(spy_1m_df.groupby('date')['cumulative_ups'].max().reset_index().rename(columns={'cumulative_ups': 'yest_total_ups'}))
# spy_1m_df.groupby('date').cumulative_ups.max()

date
2024-01-02    377
2024-01-03    355
2024-01-04    376
2024-01-05    365
2024-01-08    389
             ... 
2024-11-13    376
2024-11-14    380
2024-11-15    362
2024-11-18    391
2024-11-19    383
Name: cumulative_ups, Length: 224, dtype: int64

In [29]:
# spy_1m_df.groupby('date')['cumulative_ups'].transform(lambda x: x.max())
spy_1m_df.groupby('date')['cumulative_ups'].transform(lambda x: x.max())

0         False
1         False
2         False
3         False
4         False
          ...  
214789     True
214790     True
214791     True
214792     True
214793     True
Name: cumulative_ups, Length: 214794, dtype: bool