In [1]:
import pandas as pd
import numpy as np
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore")

### data

In [2]:
df1 = pd.read_parquet('stock_bars.parquet')
spy_df = df1[df1.symbol=='SPY']
spy_df['timestamp'] = pd.to_datetime(spy_df['timestamp'])
spy_df['us_eastern_timestamp'] = spy_df['timestamp'].dt.tz_convert('US/Eastern')
spy_df['us_eastern_date'] = spy_df.us_eastern_timestamp.dt.date
spy_df['market_open'] = spy_df.us_eastern_timestamp.between('09:30:00', '16:00:00')
spy_df.set_index('us_eastern_timestamp', inplace=True)

In [3]:
spy_df.head()

Unnamed: 0_level_0,symbol,timestamp,open,high,low,close,volume,trade_count,vwap,us_eastern_date,market_open
us_eastern_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2019-12-31 19:01:00-05:00,SPY,2020-01-01 00:01:00+00:00,322.36,322.36,322.36,322.36,1073.0,23.0,322.36,2019-12-31,False
2019-12-31 19:11:00-05:00,SPY,2020-01-01 00:11:00+00:00,322.38,322.38,322.38,322.38,300.0,6.0,322.38,2019-12-31,False
2019-12-31 19:12:00-05:00,SPY,2020-01-01 00:12:00+00:00,322.38,322.38,322.38,322.38,2400.0,16.0,322.38,2019-12-31,False
2019-12-31 19:14:00-05:00,SPY,2020-01-01 00:14:00+00:00,322.35,322.35,322.35,322.35,200.0,1.0,322.35,2019-12-31,False
2019-12-31 19:18:00-05:00,SPY,2020-01-01 00:18:00+00:00,322.38,322.38,322.38,322.38,1910.0,28.0,322.38,2019-12-31,False


In [10]:
spy_df_daily = pd.concat([
    spy_df.groupby('us_eastern_date').symbol.first(),
    spy_df.groupby('us_eastern_date').open.first(),
    spy_df.groupby('us_eastern_date').close.last(),
    spy_df.groupby('us_eastern_date').high.max(),
    spy_df.groupby('us_eastern_date').low.min(),
    spy_df.groupby('us_eastern_date').volume.sum()
    ], axis=1).reset_index()

In [5]:
spy_df_backup = spy_df.copy()

In [6]:
spy_df = spy_df.drop(columns=['timestamp'])
spy_df

Unnamed: 0_level_0,symbol,open,high,low,close,volume,trade_count,vwap,us_eastern_date,market_open
us_eastern_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019-12-31 19:01:00-05:00,SPY,322.3600,322.36,322.3600,322.3600,1073.0,23.0,322.360000,2019-12-31,False
2019-12-31 19:11:00-05:00,SPY,322.3800,322.38,322.3800,322.3800,300.0,6.0,322.380000,2019-12-31,False
2019-12-31 19:12:00-05:00,SPY,322.3800,322.38,322.3800,322.3800,2400.0,16.0,322.380000,2019-12-31,False
2019-12-31 19:14:00-05:00,SPY,322.3500,322.35,322.3500,322.3500,200.0,1.0,322.350000,2019-12-31,False
2019-12-31 19:18:00-05:00,SPY,322.3800,322.38,322.3800,322.3800,1910.0,28.0,322.380000,2019-12-31,False
...,...,...,...,...,...,...,...,...,...,...
2024-11-07 14:06:00-05:00,SPY,594.7100,594.97,594.5000,594.9300,163155.0,1225.0,594.718549,2024-11-07,False
2024-11-07 14:07:00-05:00,SPY,594.8800,594.94,594.7601,594.8959,103768.0,760.0,594.845416,2024-11-07,False
2024-11-07 14:08:00-05:00,SPY,594.9000,595.15,594.8500,595.0400,63505.0,783.0,594.996543,2024-11-07,False
2024-11-07 14:09:00-05:00,SPY,595.0101,595.04,594.9000,595.0100,100873.0,726.0,594.953314,2024-11-07,False


### features

#### ema, sma, rsi

In [7]:
# ema, sma
 
def calculate_ma(df, ema=True, sma=True, all_windows=[x for x in range(1, 240)]):
    # minute_windows = [1, 2, 3, 5, 9, 10, 12, 10, 30]  # Minute-level windows
    # minute_windows = [x for x in range(1, 30)] + [50, 100, 200, 500]
    # hour_windows = [60, 120]  # Hour-level windows, converted to minutes
    # day_windows = [1440, 2880, 4320, 7200, 14400, 20160]  # Day-level windows, converted to minutes

    # Combine all windows for processing
    # all_windows = minute_windows + hour_windows + day_windows
    # all_windows = [x for x in range(1, 240)]

    # Calculate EMAs and SMAs for each window
    for window in tqdm(all_windows):
        for price in ['open', 'high', 'low', 'close', 'volume']:
            df[f'{price}_ema_{window}m'] = df[price].ewm(span=window, adjust=False).mean()
            df[f'{price}_sma_{window}m'] = df[price].rolling(window=window).mean()
    return df

In [None]:
spy_df = calculate_ma(spy_df, all_windows=[x for x in range(1, 240)])
spy_df_daily = calculate_ma(spy_df_daily, all_windows=[x for x in range(1, 240)])

100%|██████████| 239/239 [00:01<00:00, 174.20it/s]


In [None]:
spy_df.to_parquet('../data/spy_df_with_all_averages.parquet', index=False, partition_cols=['symbol',])
# spy_df.to_parquet('spy_df_with_all_averages_copy.parquet', index=False, partition_cols=['symbol',])
# spy_df.to_parquet('s3://sisyphus-general-bucket/AthenaInsights/temp_data/spy_df_with_all_averages.parquet', index=False, partition_cols=['symbol',])

In [None]:
spy_df_daily.to_parquet('../data/spy_df_daily_with_all_averages.parquet', index=False, partition_cols=['symbol',])
# spy_df_daily.to_parquet('spy_df_daily_with_all_averages_copy.parquet', index=False, partition_cols=['symbol', ])
# spy_df_daily.to_parquet('s3://sisyphus-general-bucket/AthenaInsights/temp_data/spy_df_daily_with_all_averages.parquet', index=False, partition_cols=['symbol', ])

In [25]:
spy_df[['open', 'open_sma_5m']].tail(1000).to_csv('testing.csv')