In [18]:
import pandas as pd
import numpy as np
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore")

### minute level data

In [19]:
# stock_data_day_level_name: stock_bars_day.parquet
# stock_bars_hour_level_name: stock_bars_hour.parquet
# stock_bars_minute_level_name: stock_bars_minute.parquet

In [20]:
df = pd.read_parquet('../data/parquet/stock_bars_minute.parquet')
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['us_eastern_timestamp'] = df['timestamp'].dt.tz_convert('US/Eastern')
df['us_eastern_date'] = df.us_eastern_timestamp.dt.date
df['market_open'] = df.us_eastern_timestamp.between('09:30:00', '16:00:00')
df.set_index('us_eastern_timestamp', inplace=True)

In [21]:
df.head()

Unnamed: 0_level_0,symbol,timestamp,open,high,low,close,volume,trade_count,vwap,us_eastern_date,market_open
us_eastern_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-01-02 09:55:00-05:00,DAX,2020-01-02 14:55:00+00:00,28.5375,28.5375,28.5375,28.5375,100.0,1.0,28.5375,2020-01-02,False
2020-01-02 10:55:00-05:00,DAX,2020-01-02 15:55:00+00:00,28.4706,28.4706,28.4706,28.4706,438.0,1.0,28.4706,2020-01-02,False
2020-01-02 10:56:00-05:00,DAX,2020-01-02 15:56:00+00:00,28.405,28.405,28.405,28.405,116.0,2.0,28.405,2020-01-02,False
2020-01-02 10:57:00-05:00,DAX,2020-01-02 15:57:00+00:00,28.48,28.48,28.46,28.46,205.0,3.0,28.47,2020-01-02,False
2020-01-02 11:11:00-05:00,DAX,2020-01-02 16:11:00+00:00,28.6631,28.6631,28.6631,28.6631,100.0,1.0,28.6631,2020-01-02,False


In [22]:
def process_dataset(df, symbol, datasets=None):
    df = df[df.symbol==symbol]
    if datasets is None:
        datasets = {}
    for dur in durations:
        new_dataset = df.resample(dur).agg({'open': 'first', 'high': 'max', 'low': 'min', 'close': 'last', 'volume': 'sum'})
        datasets[dur] = new_dataset.copy()
    return datasets


datasets = {}
symbol = df.symbol.unique() # 'SPY'
durations = ['2min', '3min', '5min', '10min', '15min', '20min', '25min', '30min']
for sym in symbol:
    datasets[sym] = {}
    datasets[sym] = process_dataset(df, sym)

In [23]:
datasets.keys(), datasets['SPY'].keys()

(dict_keys(['DAX', 'QQQ', 'SPY']),
 dict_keys(['2min', '3min', '5min', '10min', '15min', '20min', '25min', '30min']))

### hour level data

In [24]:
# stock_data_day_level_name: stock_bars_day.parquet
# stock_bars_hour_level_name: stock_bars_hour.parquet
# stock_bars_minute_level_name: stock_bars_minute.parquet

In [25]:
df = pd.read_parquet('../data/parquet/stock_bars_hour.parquet')
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['us_eastern_timestamp'] = df['timestamp'].dt.tz_convert('US/Eastern')
df['us_eastern_date'] = df.us_eastern_timestamp.dt.date
df['market_open'] = df.us_eastern_timestamp.between('09:30:00', '16:00:00')
df.set_index('us_eastern_timestamp', inplace=True)

In [26]:
df

Unnamed: 0_level_0,symbol,timestamp,open,high,low,close,volume,trade_count,vwap,us_eastern_date,market_open
us_eastern_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-01-02 09:00:00-05:00,DAX,2020-01-02 14:00:00+00:00,28.5375,28.5375,28.5375,28.5375,100.0,1.0,28.537500,2020-01-02,False
2020-01-02 10:00:00-05:00,DAX,2020-01-02 15:00:00+00:00,28.4706,28.4800,28.4050,28.4600,759.0,6.0,28.461092,2020-01-02,False
2020-01-02 11:00:00-05:00,DAX,2020-01-02 16:00:00+00:00,28.6631,28.6631,28.6631,28.6631,100.0,1.0,28.663100,2020-01-02,False
2020-01-02 12:00:00-05:00,DAX,2020-01-02 17:00:00+00:00,28.4635,28.4635,28.4635,28.4635,349.0,1.0,28.463500,2020-01-02,False
2020-01-03 09:00:00-05:00,DAX,2020-01-03 14:00:00+00:00,27.9800,27.9800,27.9800,27.9800,241.0,6.0,27.980000,2020-01-03,False
...,...,...,...,...,...,...,...,...,...,...,...
2024-11-08 15:00:00-05:00,SPY,2024-11-08 20:00:00+00:00,598.9200,598.9800,597.6200,598.2400,13676393.0,81612.0,598.076688,2024-11-08,False
2024-11-08 16:00:00-05:00,SPY,2024-11-08 21:00:00+00:00,598.2400,598.4300,597.8833,597.9200,8072452.0,5641.0,598.064492,2024-11-08,False
2024-11-08 17:00:00-05:00,SPY,2024-11-08 22:00:00+00:00,597.9800,598.0800,597.7100,598.0300,221692.0,811.0,597.976766,2024-11-08,False
2024-11-08 18:00:00-05:00,SPY,2024-11-08 23:00:00+00:00,598.0300,598.0400,597.8200,598.0300,10819.0,316.0,597.925439,2024-11-08,False


In [27]:
def process_dataset(df, symbol, datasets=None):
    df = df[df.symbol==symbol]
    if datasets is None:
        datasets = {}
    for dur in durations:
        new_dataset = df.resample(dur).agg({'open': 'first', 'high': 'max', 'low': 'min', 'close': 'last', 'volume': 'sum'})
        datasets[dur] = new_dataset.copy()
    return datasets


symbol = df.symbol.unique() # 'SPY'
durations = ['60min', '120min', '180min', ]
for sym in symbol:
    datasets[sym] = process_dataset(df, sym, datasets[sym])

In [28]:
datasets.keys(), datasets['SPY'].keys()

(dict_keys(['DAX', 'QQQ', 'SPY']),
 dict_keys(['2min', '3min', '5min', '10min', '15min', '20min', '25min', '30min', '60min', '120min', '180min']))

### day level data

In [None]:
# stock_data_day_level_name: stock_bars_day.parquet
# stock_bars_hour_level_name: stock_bars_hour.parquet
# stock_bars_minute_level_name: stock_bars_minute.parquet

In [31]:
df = pd.read_parquet('../data/parquet/stock_bars_day.parquet')
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['us_eastern_timestamp'] = df['timestamp'].dt.tz_convert('US/Eastern')
df['us_eastern_date'] = df.us_eastern_timestamp.dt.date
df['market_open'] = df.us_eastern_timestamp.between('09:30:00', '16:00:00')
df.set_index('us_eastern_timestamp', inplace=True)

In [32]:
df

Unnamed: 0_level_0,symbol,timestamp,open,high,low,close,volume,trade_count,vwap,us_eastern_date,market_open
us_eastern_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-01-02 00:00:00-05:00,DAX,2020-01-02 05:00:00+00:00,28.5375,28.6631,28.4050,28.4635,1596.0,31.0,28.483275,2020-01-02,False
2020-01-03 00:00:00-05:00,DAX,2020-01-03 05:00:00+00:00,27.9800,28.1114,27.9701,27.9900,2848.0,25.0,28.027624,2020-01-03,False
2020-01-06 00:00:00-05:00,DAX,2020-01-06 05:00:00+00:00,27.8000,27.8948,27.8000,27.8948,975.0,15.0,27.825705,2020-01-06,False
2020-01-07 00:00:00-05:00,DAX,2020-01-07 05:00:00+00:00,28.0400,28.0658,28.0400,28.0400,665.0,21.0,28.050874,2020-01-07,False
2020-01-08 00:00:00-05:00,DAX,2020-01-08 05:00:00+00:00,28.1550,28.2122,28.1550,28.2122,1724.0,35.0,28.184157,2020-01-08,False
...,...,...,...,...,...,...,...,...,...,...,...
2024-11-04 00:00:00-05:00,SPY,2024-11-04 05:00:00+00:00,571.1800,572.5000,567.8900,569.8100,38216975.0,394247.0,570.278487,2024-11-04,False
2024-11-05 00:00:00-05:00,SPY,2024-11-05 05:00:00+00:00,570.7400,576.7400,570.5200,576.7000,39478322.0,378253.0,575.077202,2024-11-05,False
2024-11-06 00:00:00-05:00,SPY,2024-11-06 05:00:00+00:00,589.2000,591.9300,585.3900,591.0400,68181968.0,666095.0,589.331949,2024-11-06,False
2024-11-07 00:00:00-05:00,SPY,2024-11-07 05:00:00+00:00,593.0800,596.6500,592.9999,595.6100,47233212.0,427536.0,594.921264,2024-11-07,False


In [38]:
def process_dataset(df, symbol, datasets=None):
    df = df[df.symbol==symbol]
    if datasets is None:
        datasets = {}
    for dur in durations:
        new_dataset = df.resample(dur).agg({'open': 'first', 'high': 'max', 'low': 'min', 'close': 'last', 'volume': 'sum'})
        datasets[dur] = new_dataset.copy()
    return datasets


symbol = df.symbol.unique() # 'SPY'
durations = ['1D', '2D', '3D', ]
for sym in symbol:
    datasets[sym] = process_dataset(df, sym, datasets[sym])

In [39]:
datasets.keys(), datasets['SPY'].keys()

(dict_keys(['DAX', 'QQQ', 'SPY']),
 dict_keys(['2min', '3min', '5min', '10min', '15min', '20min', '25min', '30min', '60min', '120min', '180min', '1D', '2D', '3D']))

### extras

In [10]:
spy_df_daily = pd.concat([
    spy_df.groupby('us_eastern_date').symbol.first(),
    spy_df.groupby('us_eastern_date').open.first(),
    spy_df.groupby('us_eastern_date').close.last(),
    spy_df.groupby('us_eastern_date').high.max(),
    spy_df.groupby('us_eastern_date').low.min(),
    spy_df.groupby('us_eastern_date').volume.sum()
    ], axis=1).reset_index()

In [5]:
spy_df_backup = spy_df.copy()

In [6]:
spy_df = spy_df.drop(columns=['timestamp'])
spy_df

Unnamed: 0_level_0,symbol,open,high,low,close,volume,trade_count,vwap,us_eastern_date,market_open
us_eastern_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019-12-31 19:01:00-05:00,SPY,322.3600,322.36,322.3600,322.3600,1073.0,23.0,322.360000,2019-12-31,False
2019-12-31 19:11:00-05:00,SPY,322.3800,322.38,322.3800,322.3800,300.0,6.0,322.380000,2019-12-31,False
2019-12-31 19:12:00-05:00,SPY,322.3800,322.38,322.3800,322.3800,2400.0,16.0,322.380000,2019-12-31,False
2019-12-31 19:14:00-05:00,SPY,322.3500,322.35,322.3500,322.3500,200.0,1.0,322.350000,2019-12-31,False
2019-12-31 19:18:00-05:00,SPY,322.3800,322.38,322.3800,322.3800,1910.0,28.0,322.380000,2019-12-31,False
...,...,...,...,...,...,...,...,...,...,...
2024-11-07 14:06:00-05:00,SPY,594.7100,594.97,594.5000,594.9300,163155.0,1225.0,594.718549,2024-11-07,False
2024-11-07 14:07:00-05:00,SPY,594.8800,594.94,594.7601,594.8959,103768.0,760.0,594.845416,2024-11-07,False
2024-11-07 14:08:00-05:00,SPY,594.9000,595.15,594.8500,595.0400,63505.0,783.0,594.996543,2024-11-07,False
2024-11-07 14:09:00-05:00,SPY,595.0101,595.04,594.9000,595.0100,100873.0,726.0,594.953314,2024-11-07,False


### features

#### ema, sma, rsi

In [7]:
# ema, sma
 
def calculate_ma(df, ema=True, sma=True, all_windows=[x for x in range(1, 240)]):
    # minute_windows = [1, 2, 3, 5, 9, 10, 12, 10, 30]  # Minute-level windows
    # minute_windows = [x for x in range(1, 30)] + [50, 100, 200, 500]
    # hour_windows = [60, 120]  # Hour-level windows, converted to minutes
    # day_windows = [1440, 2880, 4320, 7200, 14400, 20160]  # Day-level windows, converted to minutes

    # Combine all windows for processing
    # all_windows = minute_windows + hour_windows + day_windows
    # all_windows = [x for x in range(1, 240)]

    # Calculate EMAs and SMAs for each window
    for window in tqdm(all_windows):
        for price in ['open', 'high', 'low', 'close', 'volume']:
            df[f'{price}_ema_{window}m'] = df[price].ewm(span=window, adjust=False).mean()
            df[f'{price}_sma_{window}m'] = df[price].rolling(window=window).mean()
    return df

In [None]:
spy_df = calculate_ma(spy_df, all_windows=[x for x in range(1, 240)])
spy_df_daily = calculate_ma(spy_df_daily, all_windows=[x for x in range(1, 240)])

100%|██████████| 239/239 [00:01<00:00, 174.20it/s]


In [None]:
spy_df.to_parquet('../data/spy_df_with_all_averages.parquet', index=False, partition_cols=['symbol',])
# spy_df.to_parquet('spy_df_with_all_averages_copy.parquet', index=False, partition_cols=['symbol',])
# spy_df.to_parquet('s3://sisyphus-general-bucket/AthenaInsights/temp_data/spy_df_with_all_averages.parquet', index=False, partition_cols=['symbol',])

In [None]:
spy_df_daily.to_parquet('../data/spy_df_daily_with_all_averages.parquet', index=False, partition_cols=['symbol',])
# spy_df_daily.to_parquet('spy_df_daily_with_all_averages_copy.parquet', index=False, partition_cols=['symbol', ])
# spy_df_daily.to_parquet('s3://sisyphus-general-bucket/AthenaInsights/temp_data/spy_df_daily_with_all_averages.parquet', index=False, partition_cols=['symbol', ])

In [25]:
spy_df[['open', 'open_sma_5m']].tail(1000).to_csv('testing.csv')