In [1]:
import pandas as pd
import numpy as np
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore")

In [2]:
def process_dataset(df, symbol, datasets=None):
    df = df[df.symbol==symbol]
    if datasets is None:
        datasets = {}
    for dur in durations:
        new_dataset = df.resample(dur).agg({'open': 'first', 'high': 'max', 'low': 'min', 'close': 'last', 'volume': 'sum'})
        datasets[dur] = new_dataset.copy()
    return datasets

### minute level data

In [3]:
bucket_loc = 's3://sisyphus-general-bucket/AthenaInsights'

In [4]:
# stock_data_day_level_name: stock_bars_day.parquet
# stock_bars_hour_level_name: stock_bars_hour.parquet
# stock_bars_minute_level_name: stock_bars_minute.parquet

In [5]:
df = pd.read_parquet(f'{bucket_loc}/data/parquet/stock_bars_minute.parquet')
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['us_eastern_timestamp'] = df['timestamp'].dt.tz_convert('US/Eastern')
df['us_eastern_timestamp'] = df['us_eastern_timestamp'].dt.tz_localize(None)

df['us_eastern_date'] = df.us_eastern_timestamp.dt.date
# df['market_open'] = df.us_eastern_timestamp.between('09:30:00', '16:00:00')
df['market_open'] = (df.us_eastern_timestamp.dt.time>=pd.to_datetime('09:30:00').time()) & (df.us_eastern_timestamp.dt.time < pd.to_datetime('16:00:00').time())
df.set_index('us_eastern_timestamp', inplace=True)

In [6]:
df.head()

Unnamed: 0_level_0,symbol,timestamp,open,high,low,close,volume,trade_count,vwap,us_eastern_date,market_open
us_eastern_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-01-02 09:55:00,DAX,2020-01-02 14:55:00+00:00,28.5375,28.5375,28.5375,28.5375,100.0,1.0,28.5375,2020-01-02,True
2020-01-02 10:55:00,DAX,2020-01-02 15:55:00+00:00,28.4706,28.4706,28.4706,28.4706,438.0,1.0,28.4706,2020-01-02,True
2020-01-02 10:56:00,DAX,2020-01-02 15:56:00+00:00,28.405,28.405,28.405,28.405,116.0,2.0,28.405,2020-01-02,True
2020-01-02 10:57:00,DAX,2020-01-02 15:57:00+00:00,28.48,28.48,28.46,28.46,205.0,3.0,28.47,2020-01-02,True
2020-01-02 11:11:00,DAX,2020-01-02 16:11:00+00:00,28.6631,28.6631,28.6631,28.6631,100.0,1.0,28.6631,2020-01-02,True


In [7]:
datasets = {}
symbol = df.symbol.unique() # 'SPY'
durations = ['2min', '3min', '5min', '10min', '15min', '20min', '25min', '30min']
for sym in symbol:
    datasets[sym] = {}
    datasets[sym]['1min'] = df[df.symbol==sym].copy()
    datasets[sym] = process_dataset(df, sym, datasets[sym])

In [8]:
datasets.keys(), datasets['SPY'].keys()

(dict_keys(['DAX', 'DJIA', 'QQQ', 'SPY', 'VIXM']),
 dict_keys(['1min', '2min', '3min', '5min', '10min', '15min', '20min', '25min', '30min']))

### hour level data

In [9]:
# stock_data_day_level_name: stock_bars_day.parquet
# stock_bars_hour_level_name: stock_bars_hour.parquet
# stock_bars_minute_level_name: stock_bars_minute.parquet

In [10]:
df2 = pd.read_parquet(f'{bucket_loc}/data/parquet/stock_bars_hour.parquet')
df2['timestamp'] = pd.to_datetime(df2['timestamp'])
df2['us_eastern_timestamp'] = df2['timestamp'].dt.tz_convert('US/Eastern')
df2['us_eastern_timestamp'] = df2['us_eastern_timestamp'].dt.tz_localize(None)

df2['us_eastern_date'] = df2.us_eastern_timestamp.dt.date
# df2['market_open'] = df2.us_eastern_timestamp.between('09:30:00', '16:00:00')
df2['market_open'] = (df2.us_eastern_timestamp.dt.time>=pd.to_datetime('09:30:00').time()) & (df2.us_eastern_timestamp.dt.time < pd.to_datetime('16:00:00').time())
df2.set_index('us_eastern_timestamp', inplace=True)

In [11]:
df2.head()

Unnamed: 0_level_0,symbol,timestamp,open,high,low,close,volume,trade_count,vwap,us_eastern_date,market_open
us_eastern_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-01-02 09:00:00,DAX,2020-01-02 14:00:00+00:00,28.5375,28.5375,28.5375,28.5375,100.0,1.0,28.5375,2020-01-02,False
2020-01-02 10:00:00,DAX,2020-01-02 15:00:00+00:00,28.4706,28.48,28.405,28.46,759.0,6.0,28.461092,2020-01-02,True
2020-01-02 11:00:00,DAX,2020-01-02 16:00:00+00:00,28.6631,28.6631,28.6631,28.6631,100.0,1.0,28.6631,2020-01-02,True
2020-01-02 12:00:00,DAX,2020-01-02 17:00:00+00:00,28.4635,28.4635,28.4635,28.4635,349.0,1.0,28.4635,2020-01-02,True
2020-01-03 09:00:00,DAX,2020-01-03 14:00:00+00:00,27.98,27.98,27.98,27.98,241.0,6.0,27.98,2020-01-03,False


In [12]:
symbol = df2.symbol.unique() # 'SPY'
durations = ['120min', '180min', '240min']
for sym in symbol:
    datasets[sym]['60min'] = df2[df2.symbol==sym].copy()
    datasets[sym] = process_dataset(df2, sym, datasets[sym])

In [13]:
datasets.keys(), datasets['SPY'].keys()

(dict_keys(['DAX', 'DJIA', 'QQQ', 'SPY', 'VIXM']),
 dict_keys(['1min', '2min', '3min', '5min', '10min', '15min', '20min', '25min', '30min', '60min', '120min', '180min', '240min']))

### day level data

In [14]:
# stock_data_day_level_name: stock_bars_day.parquet
# stock_bars_hour_level_name: stock_bars_hour.parquet
# stock_bars_minute_level_name: stock_bars_minute.parquet

In [15]:
df3 = pd.read_parquet(f'{bucket_loc}/data/parquet/stock_bars_day.parquet')
df3['timestamp'] = pd.to_datetime(df3['timestamp'])
df3['us_eastern_timestamp'] = df3['timestamp'].dt.tz_convert('US/Eastern')
df3['us_eastern_timestamp'] = df3['us_eastern_timestamp'].dt.tz_localize(None)

df3['us_eastern_date'] = df3.us_eastern_timestamp.dt.date
# df3['market_open'] = df3.us_eastern_timestamp.between('09:30:00', '16:00:00')
df3['market_open'] = (df3.us_eastern_timestamp.dt.time>=pd.to_datetime('09:30:00').time()) & (df3.us_eastern_timestamp.dt.time < pd.to_datetime('16:00:00').time())
df3.set_index('us_eastern_timestamp', inplace=True)

In [16]:
df3.head()

Unnamed: 0_level_0,symbol,timestamp,open,high,low,close,volume,trade_count,vwap,us_eastern_date,market_open
us_eastern_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-01-02,DAX,2020-01-02 05:00:00+00:00,28.5375,28.6631,28.405,28.4635,1596.0,31.0,28.483275,2020-01-02,False
2020-01-03,DAX,2020-01-03 05:00:00+00:00,27.98,28.1114,27.9701,27.99,2848.0,25.0,28.027624,2020-01-03,False
2020-01-06,DAX,2020-01-06 05:00:00+00:00,27.8,27.8948,27.8,27.8948,975.0,15.0,27.825705,2020-01-06,False
2020-01-07,DAX,2020-01-07 05:00:00+00:00,28.04,28.0658,28.04,28.04,665.0,21.0,28.050874,2020-01-07,False
2020-01-08,DAX,2020-01-08 05:00:00+00:00,28.155,28.2122,28.155,28.2122,1724.0,35.0,28.184157,2020-01-08,False


In [17]:
symbol = df3.symbol.unique() # 'SPY'
durations = ['2D', '3D', '5D', '10D', '15D', '20D', '30D', '50D', '100D', '150D', '200D']
for sym in symbol:
    datasets[sym]['1D'] = df3[df3.symbol==sym].copy()
    datasets[sym] = process_dataset(df3, sym, datasets[sym])

In [18]:
df2[df2.symbol=='SPY'].tail()

Unnamed: 0_level_0,symbol,timestamp,open,high,low,close,volume,trade_count,vwap,us_eastern_date,market_open
us_eastern_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2024-11-11 13:00:00,SPY,2024-11-11 18:00:00+00:00,598.21,598.64,597.0,598.23,3361586.0,45160.0,597.773751,2024-11-11,True
2024-11-11 14:00:00,SPY,2024-11-11 19:00:00+00:00,598.24,598.41,598.0,598.2885,2356680.0,28995.0,598.197857,2024-11-11,True
2024-11-11 15:00:00,SPY,2024-11-11 20:00:00+00:00,598.29,598.84,597.98,598.72,8978604.0,64345.0,598.348699,2024-11-11,True
2024-11-11 16:00:00,SPY,2024-11-11 21:00:00+00:00,598.73,598.81,597.8633,598.38,6479844.0,5126.0,598.685855,2024-11-11,False
2024-11-11 17:00:00,SPY,2024-11-11 22:00:00+00:00,598.37,598.76,598.2,598.23,508368.0,787.0,598.685432,2024-11-11,False


### writing out to s3

In [19]:
all_durations = []
for sym in datasets.keys():
    for dur in datasets[sym].keys():
        all_durations.append(dur)
all_durations = set(all_durations)
print(all_durations)

{'180min', '2min', '10min', '60min', '1min', '15D', '30D', '15min', '20min', '50D', '150D', '100D', '30min', '120min', '25min', '3D', '3min', '10D', '20D', '5D', '240min', '5min', '1D', '2D', '200D'}


In [20]:
for dur in all_durations:
    dur_df = pd.DataFrame()
    for sym in datasets.keys():
        if dur in datasets[sym].keys():
            dur_df = pd.concat([dur_df, datasets[sym][dur].assign(symbol=sym)])
    dur_df.to_parquet(f'{bucket_loc}/data/data_prep/stock_bars_{dur}.parquet')

### testing

In [21]:
date_f = '2024-11-04'

In [22]:
# x = df[df.symbol=='SPY'].reset_index()
x[(x.timestamp>=pd.to_datetime(f'2024-11-04 00:00:00+00:00'))&(x.timestamp<pd.to_datetime(f'2024-11-04 23:00:000+00:00'))].trade_count.sum()
x[(x.us_eastern_timestamp>=pd.to_datetime(f'2024-11-04 00:00:00'))&(x.us_eastern_timestamp<pd.to_datetime(f'2024-11-04 23:00:00'))]#.trade_count.sum()

NameError: name 'x' is not defined

In [None]:
y = df2[df2.symbol=='SPY']
y[y.us_eastern_date==pd.to_datetime(date_f)]

In [None]:
z = df3[df3.symbol=='SPY']
z[z.us_eastern_date==pd.to_datetime(date_f)]

In [None]:
# 