In [1]:
import pandas as pd
import numpy as np
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore")

In [2]:
def process_dataset(df, symbol, datasets=None):
    df = df[df.symbol==symbol]
    if datasets is None:
        datasets = {}
    for dur in durations:
        new_dataset = df.resample(dur).agg({'open': 'first', 'high': 'max', 'low': 'min', 'close': 'last', 'volume': 'sum'})
        datasets[dur] = new_dataset.copy()
    return datasets

### minute level data

In [3]:
bucket_loc = 's3://sisyphus-general-bucket/AthenaInsights'
data_folder = 'latest_data'

In [4]:
# stock_data_day_level_name: stock_bars_day.parquet
# stock_bars_hour_level_name: stock_bars_hour.parquet
# stock_bars_minute_level_name: stock_bars_minute.parquet

In [5]:
df = pd.read_parquet(f'{bucket_loc}/{data_folder}/parquet/stock_bars_minute.parquet')
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['us_eastern_timestamp'] = df['timestamp'].dt.tz_convert('US/Eastern')
df['us_eastern_timestamp'] = df['us_eastern_timestamp'].dt.tz_localize(None)

df['us_eastern_date'] = df.us_eastern_timestamp.dt.date
# df['market_open'] = df.us_eastern_timestamp.between('09:30:00', '16:00:00')
df['market_open'] = (df.us_eastern_timestamp.dt.time>=pd.to_datetime('09:30:00').time()) & (df.us_eastern_timestamp.dt.time < pd.to_datetime('16:00:00').time())
df.set_index('us_eastern_timestamp', inplace=True)

In [6]:
df.head()

Unnamed: 0_level_0,symbol,timestamp,open,high,low,close,volume,trade_count,vwap,us_eastern_date,market_open
us_eastern_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2024-01-02 07:06:00,DAX,2024-01-02 12:06:00+00:00,30.8,30.8,30.8,30.8,100.0,1.0,30.8,2024-01-02,False
2024-01-02 09:30:00,DAX,2024-01-02 14:30:00+00:00,30.65,30.65,30.65,30.65,389.0,10.0,30.65,2024-01-02,True
2024-01-02 09:32:00,DAX,2024-01-02 14:32:00+00:00,30.541,30.58,30.541,30.56,2478.0,16.0,30.554801,2024-01-02,True
2024-01-02 09:37:00,DAX,2024-01-02 14:37:00+00:00,30.66,30.66,30.66,30.66,120.0,2.0,30.66,2024-01-02,True
2024-01-02 09:40:00,DAX,2024-01-02 14:40:00+00:00,30.585,30.64,30.585,30.64,738.0,3.0,30.599925,2024-01-02,True


In [7]:
datasets = {}
symbol = df.symbol.unique() # 'SPY'
durations = ['2min', '3min', '5min', '10min', '15min', '20min', '25min', '30min']
for sym in symbol:
    datasets[sym] = {}
    datasets[sym]['1min'] = df[df.symbol==sym].copy()
    datasets[sym] = process_dataset(df, sym, datasets[sym])

In [8]:
datasets.keys(), datasets['SPY'].keys()

(dict_keys(['DAX', 'DJIA', 'QQQ', 'SPY', 'VIXM']),
 dict_keys(['1min', '2min', '3min', '5min', '10min', '15min', '20min', '25min', '30min']))

### hour level data

In [9]:
# stock_data_day_level_name: stock_bars_day.parquet
# stock_bars_hour_level_name: stock_bars_hour.parquet
# stock_bars_minute_level_name: stock_bars_minute.parquet

In [10]:
df2 = pd.read_parquet(f'{bucket_loc}/{data_folder}/parquet/stock_bars_hour.parquet')
df2['timestamp'] = pd.to_datetime(df2['timestamp'])
df2['us_eastern_timestamp'] = df2['timestamp'].dt.tz_convert('US/Eastern')
df2['us_eastern_timestamp'] = df2['us_eastern_timestamp'].dt.tz_localize(None)

df2['us_eastern_date'] = df2.us_eastern_timestamp.dt.date
# df2['market_open'] = df2.us_eastern_timestamp.between('09:30:00', '16:00:00')
df2['market_open'] = (df2.us_eastern_timestamp.dt.time>=pd.to_datetime('09:30:00').time()) & (df2.us_eastern_timestamp.dt.time < pd.to_datetime('16:00:00').time())
df2.set_index('us_eastern_timestamp', inplace=True)

In [11]:
df2.head()

Unnamed: 0_level_0,symbol,timestamp,open,high,low,close,volume,trade_count,vwap,us_eastern_date,market_open
us_eastern_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2024-01-02 07:00:00,DAX,2024-01-02 12:00:00+00:00,30.8,30.8,30.8,30.8,100.0,1.0,30.8,2024-01-02,False
2024-01-02 09:00:00,DAX,2024-01-02 14:00:00+00:00,30.65,30.7,30.541,30.7,5091.0,38.0,30.599758,2024-01-02,False
2024-01-02 10:00:00,DAX,2024-01-02 15:00:00+00:00,30.651,30.69,30.62,30.6775,700.0,6.0,30.659786,2024-01-02,True
2024-01-02 11:00:00,DAX,2024-01-02 16:00:00+00:00,30.63,30.7,30.63,30.7,9456.0,3.0,30.689006,2024-01-02,True
2024-01-02 12:00:00,DAX,2024-01-02 17:00:00+00:00,30.64,30.69,30.64,30.69,853.0,2.0,30.678394,2024-01-02,True


In [12]:
symbol = df2.symbol.unique() # 'SPY'
durations = ['120min', '180min', '240min']
for sym in symbol:
    datasets[sym]['60min'] = df2[df2.symbol==sym].copy()
    datasets[sym] = process_dataset(df2, sym, datasets[sym])

In [13]:
datasets.keys(), datasets['SPY'].keys()

(dict_keys(['DAX', 'DJIA', 'QQQ', 'SPY', 'VIXM']),
 dict_keys(['1min', '2min', '3min', '5min', '10min', '15min', '20min', '25min', '30min', '60min', '120min', '180min', '240min']))

### day level data

In [14]:
# stock_data_day_level_name: stock_bars_day.parquet
# stock_bars_hour_level_name: stock_bars_hour.parquet
# stock_bars_minute_level_name: stock_bars_minute.parquet

In [15]:
df3 = pd.read_parquet(f'{bucket_loc}/{data_folder}/parquet/stock_bars_day.parquet')
df3['timestamp'] = pd.to_datetime(df3['timestamp'])
df3['us_eastern_timestamp'] = df3['timestamp'].dt.tz_convert('US/Eastern')
df3['us_eastern_timestamp'] = df3['us_eastern_timestamp'].dt.tz_localize(None)

df3['us_eastern_date'] = df3.us_eastern_timestamp.dt.date
# df3['market_open'] = df3.us_eastern_timestamp.between('09:30:00', '16:00:00')
df3['market_open'] = (df3.us_eastern_timestamp.dt.time>=pd.to_datetime('09:30:00').time()) & (df3.us_eastern_timestamp.dt.time < pd.to_datetime('16:00:00').time())
df3.set_index('us_eastern_timestamp', inplace=True)

In [16]:
df3.head()

Unnamed: 0_level_0,symbol,timestamp,open,high,low,close,volume,trade_count,vwap,us_eastern_date,market_open
us_eastern_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2024-01-02,DAX,2024-01-02 05:00:00+00:00,30.65,30.7,30.52,30.54,17548.0,95.0,30.655652,2024-01-02,False
2024-01-03,DAX,2024-01-03 05:00:00+00:00,30.11,30.18,30.03,30.08,6597.0,110.0,30.095682,2024-01-03,False
2024-01-04,DAX,2024-01-04 05:00:00+00:00,30.26,30.4105,30.25,30.25,25429.0,75.0,30.273637,2024-01-04,False
2024-01-05,DAX,2024-01-05 05:00:00+00:00,30.19,30.5,30.19,30.31,3231.0,67.0,30.356474,2024-01-05,False
2024-01-08,DAX,2024-01-08 05:00:00+00:00,30.55,30.89,30.52,30.7,7482.0,123.0,30.690812,2024-01-08,False


In [17]:
symbol = df3.symbol.unique() # 'SPY'
durations = ['2D', '3D', '5D', '10D', '15D', '20D', '30D', '50D', '100D', '150D', '200D']
for sym in symbol:
    datasets[sym]['1D'] = df3[df3.symbol==sym].copy()
    datasets[sym] = process_dataset(df3, sym, datasets[sym])

In [18]:
df2[df2.symbol=='SPY'].tail()

Unnamed: 0_level_0,symbol,timestamp,open,high,low,close,volume,trade_count,vwap,us_eastern_date,market_open
us_eastern_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2024-11-19 15:00:00,SPY,2024-11-19 20:00:00+00:00,589.63,590.6,589.365,590.29,9525290.0,66503.0,590.016005,2024-11-19,True
2024-11-19 16:00:00,SPY,2024-11-19 21:00:00+00:00,590.28,590.8,590.2,590.24,11162473.0,4219.0,590.378012,2024-11-19,False
2024-11-19 17:00:00,SPY,2024-11-19 22:00:00+00:00,590.15,590.2199,590.0,590.12,315825.0,743.0,590.056293,2024-11-19,False
2024-11-19 18:00:00,SPY,2024-11-19 23:00:00+00:00,590.12,590.89,590.1,590.87,58448.0,854.0,590.394854,2024-11-19,False
2024-11-19 19:00:00,SPY,2024-11-20 00:00:00+00:00,591.56,591.68,591.35,591.39,22825.0,357.0,591.526779,2024-11-19,False


### writing out to s3

In [19]:
all_durations = []
for sym in datasets.keys():
    for dur in datasets[sym].keys():
        all_durations.append(dur)
all_durations = set(all_durations)
print(all_durations)

{'15min', '50D', '20min', '1min', '2D', '25min', '60min', '240min', '120min', '5D', '10D', '10min', '30min', '30D', '200D', '180min', '1D', '5min', '20D', '150D', '100D', '2min', '15D', '3D', '3min'}


In [20]:
for dur in all_durations:
    dur_df = pd.DataFrame()
    for sym in datasets.keys():
        if dur in datasets[sym].keys():
            dur_df = pd.concat([dur_df, datasets[sym][dur].assign(symbol=sym)])
    dur_df.to_parquet(f'{bucket_loc}/{data_folder}/data_prep/stock_bars_{dur}.parquet')

### testing

In [21]:
date_f = '2024-11-04'

In [22]:
# x = df[df.symbol=='SPY'].reset_index()
x[(x.timestamp>=pd.to_datetime(f'2024-11-04 00:00:00+00:00'))&(x.timestamp<pd.to_datetime(f'2024-11-04 23:00:000+00:00'))].trade_count.sum()
x[(x.us_eastern_timestamp>=pd.to_datetime(f'2024-11-04 00:00:00'))&(x.us_eastern_timestamp<pd.to_datetime(f'2024-11-04 23:00:00'))]#.trade_count.sum()

NameError: name 'x' is not defined

In [None]:
y = df2[df2.symbol=='SPY']
y[y.us_eastern_date==pd.to_datetime(date_f)]

In [None]:
z = df3[df3.symbol=='SPY']
z[z.us_eastern_date==pd.to_datetime(date_f)]

In [None]:
# 