In [1]:
import pandas as pd
import numpy as np
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore")

In [2]:
def process_dataset(df, symbol, datasets=None):
    df = df[df.symbol==symbol]
    if datasets is None:
        datasets = {}
    for dur in durations:
        new_dataset = df.resample(dur).agg({'open': 'first', 'high': 'max', 'low': 'min', 'close': 'last', 'volume': 'sum'})
        datasets[dur] = new_dataset.copy()
    return datasets

### minute level data

In [3]:
bucket_loc = 's3://sisyphus-general-bucket/AthenaInsights'

In [4]:
# stock_data_day_level_name: stock_bars_day.parquet
# stock_bars_hour_level_name: stock_bars_hour.parquet
# stock_bars_minute_level_name: stock_bars_minute.parquet

In [5]:
df = pd.read_parquet(f'{bucket_loc}/data/parquet/stock_bars_minute.parquet')
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['us_eastern_timestamp'] = df['timestamp'].dt.tz_convert('US/Eastern')
df['us_eastern_date'] = df.us_eastern_timestamp.dt.date
df['market_open'] = df.us_eastern_timestamp.between('09:30:00', '16:00:00')
df.set_index('us_eastern_timestamp', inplace=True)

In [6]:
df.head()

Unnamed: 0_level_0,symbol,timestamp,open,high,low,close,volume,trade_count,vwap,us_eastern_date,market_open
us_eastern_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-01-02 09:55:00-05:00,DAX,2020-01-02 14:55:00+00:00,28.5375,28.5375,28.5375,28.5375,100.0,1.0,28.5375,2020-01-02,False
2020-01-02 10:55:00-05:00,DAX,2020-01-02 15:55:00+00:00,28.4706,28.4706,28.4706,28.4706,438.0,1.0,28.4706,2020-01-02,False
2020-01-02 10:56:00-05:00,DAX,2020-01-02 15:56:00+00:00,28.405,28.405,28.405,28.405,116.0,2.0,28.405,2020-01-02,False
2020-01-02 10:57:00-05:00,DAX,2020-01-02 15:57:00+00:00,28.48,28.48,28.46,28.46,205.0,3.0,28.47,2020-01-02,False
2020-01-02 11:11:00-05:00,DAX,2020-01-02 16:11:00+00:00,28.6631,28.6631,28.6631,28.6631,100.0,1.0,28.6631,2020-01-02,False


In [7]:
datasets = {}
symbol = df.symbol.unique() # 'SPY'
durations = ['2min', '3min', '5min', '10min', '15min', '20min', '25min', '30min']
for sym in symbol:
    datasets[sym] = {}
    datasets[sym]['1min'] = df[df.symbol==sym].copy()
    datasets[sym] = process_dataset(df, sym, datasets[sym])

In [8]:
datasets.keys(), datasets['SPY'].keys()

(dict_keys(['DAX', 'DJIA', 'QQQ', 'SPY', 'VIXM']),
 dict_keys(['1min', '2min', '3min', '5min', '10min', '15min', '20min', '25min', '30min']))

### hour level data

In [9]:
# stock_data_day_level_name: stock_bars_day.parquet
# stock_bars_hour_level_name: stock_bars_hour.parquet
# stock_bars_minute_level_name: stock_bars_minute.parquet

In [10]:
df2 = pd.read_parquet(f'{bucket_loc}/data/parquet/stock_bars_hour.parquet')
df2['timestamp'] = pd.to_datetime(df2['timestamp'])
df2['us_eastern_timestamp'] = df2['timestamp'].dt.tz_convert('US/Eastern')
df2['us_eastern_date'] = df2.us_eastern_timestamp.dt.date
df2['market_open'] = df2.us_eastern_timestamp.between('09:30:00', '16:00:00')
df2.set_index('us_eastern_timestamp', inplace=True)

In [11]:
df2.head()

Unnamed: 0_level_0,symbol,timestamp,open,high,low,close,volume,trade_count,vwap,us_eastern_date,market_open
us_eastern_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-01-02 09:00:00-05:00,DAX,2020-01-02 14:00:00+00:00,28.5375,28.5375,28.5375,28.5375,100.0,1.0,28.5375,2020-01-02,False
2020-01-02 10:00:00-05:00,DAX,2020-01-02 15:00:00+00:00,28.4706,28.48,28.405,28.46,759.0,6.0,28.461092,2020-01-02,False
2020-01-02 11:00:00-05:00,DAX,2020-01-02 16:00:00+00:00,28.6631,28.6631,28.6631,28.6631,100.0,1.0,28.6631,2020-01-02,False
2020-01-02 12:00:00-05:00,DAX,2020-01-02 17:00:00+00:00,28.4635,28.4635,28.4635,28.4635,349.0,1.0,28.4635,2020-01-02,False
2020-01-03 09:00:00-05:00,DAX,2020-01-03 14:00:00+00:00,27.98,27.98,27.98,27.98,241.0,6.0,27.98,2020-01-03,False


In [13]:
symbol = df2.symbol.unique() # 'SPY'
durations = ['120min', '180min', '240min']
for sym in symbol:
    datasets[sym]['60min'] = df2[df2.symbol==sym].copy()
    datasets[sym] = process_dataset(df2, sym, datasets[sym])

In [14]:
datasets.keys(), datasets['SPY'].keys()

(dict_keys(['DAX', 'DJIA', 'QQQ', 'SPY', 'VIXM']),
 dict_keys(['1min', '2min', '3min', '5min', '10min', '15min', '20min', '25min', '30min', '60min', '120min', '180min', '240min']))

### day level data

In [15]:
# stock_data_day_level_name: stock_bars_day.parquet
# stock_bars_hour_level_name: stock_bars_hour.parquet
# stock_bars_minute_level_name: stock_bars_minute.parquet

In [16]:
df3 = pd.read_parquet(f'{bucket_loc}/data/parquet/stock_bars_day.parquet')
df3['timestamp'] = pd.to_datetime(df3['timestamp'])
df3['us_eastern_timestamp'] = df3['timestamp'].dt.tz_convert('US/Eastern')
df3['us_eastern_date'] = df3.us_eastern_timestamp.dt.date
df3['market_open'] = df3.us_eastern_timestamp.between('09:30:00', '16:00:00')
df3.set_index('us_eastern_timestamp', inplace=True)

In [17]:
df3.head()

Unnamed: 0_level_0,symbol,timestamp,open,high,low,close,volume,trade_count,vwap,us_eastern_date,market_open
us_eastern_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-01-02 00:00:00-05:00,DAX,2020-01-02 05:00:00+00:00,28.5375,28.6631,28.405,28.4635,1596.0,31.0,28.483275,2020-01-02,False
2020-01-03 00:00:00-05:00,DAX,2020-01-03 05:00:00+00:00,27.98,28.1114,27.9701,27.99,2848.0,25.0,28.027624,2020-01-03,False
2020-01-06 00:00:00-05:00,DAX,2020-01-06 05:00:00+00:00,27.8,27.8948,27.8,27.8948,975.0,15.0,27.825705,2020-01-06,False
2020-01-07 00:00:00-05:00,DAX,2020-01-07 05:00:00+00:00,28.04,28.0658,28.04,28.04,665.0,21.0,28.050874,2020-01-07,False
2020-01-08 00:00:00-05:00,DAX,2020-01-08 05:00:00+00:00,28.155,28.2122,28.155,28.2122,1724.0,35.0,28.184157,2020-01-08,False


In [18]:
symbol = df3.symbol.unique() # 'SPY'
durations = ['2D', '3D', '5D', '10D', '15D', '20D', '30D', '50D', '100D', '150D', '200D']
for sym in symbol:
    datasets[sym]['1D'] = df3[df3.symbol==sym].copy()
    datasets[sym] = process_dataset(df, sym, datasets[sym])

### writing out to s3

In [26]:
all_durations = []
for sym in datasets.keys():
    for dur in datasets[sym].keys():
        all_durations.append(dur)
all_durations = set(all_durations)
print(all_durations)

{'3min', '10D', '1min', '30min', '2min', '200D', '30D', '60min', '150D', '3D', '25min', '180min', '120min', '20min', '20D', '50D', '240min', '15D', '5D', '2D', '10min', '100D', '15min', '1D', '5min'}


In [41]:
for dur in all_durations:
    dur_df = pd.DataFrame()
    for sym in datasets.keys():
        if dur in datasets[sym].keys():
            dur_df = pd.concat([dur_df, datasets[sym][dur].assign(symbol=sym)])
    dur_df.to_parquet(f'{bucket_loc}/data/processed/stock_bars_{dur}.parquet')

### testing

In [19]:
datasets.keys(), datasets['SPY'].keys()

(dict_keys(['DAX', 'DJIA', 'QQQ', 'SPY', 'VIXM']),
 dict_keys(['1min', '2min', '3min', '5min', '10min', '15min', '20min', '25min', '30min', '60min', '120min', '180min', '240min', '1D', '2D', '3D', '5D', '10D', '15D', '20D', '30D', '50D', '100D', '150D', '200D']))

In [20]:
datasets['SPY']['1min'].tail()

Unnamed: 0_level_0,symbol,timestamp,open,high,low,close,volume,trade_count,vwap,us_eastern_date,market_open
us_eastern_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2024-11-11 17:13:00-05:00,SPY,2024-11-11 22:13:00+00:00,598.36,598.39,598.36,598.39,457.0,18.0,598.374925,2024-11-11,False
2024-11-11 17:14:00-05:00,SPY,2024-11-11 22:14:00+00:00,598.39,598.39,598.33,598.33,1163.0,38.0,598.354096,2024-11-11,False
2024-11-11 17:16:00-05:00,SPY,2024-11-11 22:16:00+00:00,598.39,598.39,598.39,598.39,1041.0,34.0,598.39,2024-11-11,False
2024-11-11 17:18:00-05:00,SPY,2024-11-11 22:18:00+00:00,598.3,598.3,598.27,598.27,4513.0,58.0,598.279848,2024-11-11,False
2024-11-11 17:19:00-05:00,SPY,2024-11-11 22:19:00+00:00,598.27,598.27,598.2699,598.27,3669.0,49.0,598.269997,2024-11-11,False


In [24]:
datasets['SPY']['5min'].tail()

Unnamed: 0_level_0,open,high,low,close,volume
us_eastern_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-11-11 16:55:00-05:00,598.54,598.54,598.38,598.38,14791.0
2024-11-11 17:00:00-05:00,598.37,598.76,598.31,598.3899,474198.0
2024-11-11 17:05:00-05:00,598.39,598.39,598.351,598.39,5213.0
2024-11-11 17:10:00-05:00,598.39,598.39,598.33,598.33,3785.0
2024-11-11 17:15:00-05:00,598.39,598.39,598.2699,598.27,9223.0
