In [1]:
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import warnings
from tqdm import tqdm
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this blanket filter also hides deprecation warnings from the
# libraries imported above — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [3]:
def read_df(path, columns=None):
    """Read a parquet file through pyarrow and hand it back as a pandas DataFrame.

    Args:
        path: Location of the parquet file, forwarded unchanged to
            ``pq.read_pandas`` (this notebook passes ``s3://`` URIs).
        columns: Optional column selection, forwarded unchanged to
            ``pq.read_pandas``. With the default ``None`` no selection is
            requested.

    Returns:
        The result of calling ``to_pandas()`` on the table pyarrow read.
    """
    # read_pandas (rather than read_table) also loads the pandas metadata
    # stored in the file, so the saved index is restored on conversion.
    table = pq.read_pandas(path, columns=columns)
    return table.to_pandas()

In [4]:
# S3 location of the parquet file read in this cell (it sits under the
# `data_prep` prefix of the bucket).
path = 's3://sisyphus-general-bucket/AthenaInsights/latest_data/data_prep/stock_bars_1min.parquet'
# No `columns` argument is passed, so no column selection is requested.
data_prep = read_df(path)

In [5]:
# Dimensions of the loaded frame as (row count, column count).
data_prep.shape

(746839, 9)

In [11]:
# Boolean-mask selection: show only the rows whose `symbol` column equals 'SPY'.
data_prep[data_prep.symbol=='SPY']

Unnamed: 0_level_0,symbol,open,high,low,close,volume,trade_count,vwap,market_open
us_eastern_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2024-01-02 04:00:00,SPY,476.25,476.36,476.00,476.3100,20460.0,84.0,476.301058,False
2024-01-02 04:01:00,SPY,476.34,476.34,476.29,476.2900,6369.0,16.0,476.320154,False
2024-01-02 04:02:00,SPY,476.29,476.29,476.28,476.2800,6152.0,10.0,476.280164,False
2024-01-02 04:03:00,SPY,476.27,476.27,476.27,476.2700,369.0,10.0,476.270000,False
2024-01-02 04:04:00,SPY,476.27,476.27,476.27,476.2700,369.0,10.0,476.270000,False
...,...,...,...,...,...,...,...,...,...
2024-12-03 18:53:00,SPY,604.46,604.46,604.46,604.4600,393.0,21.0,604.460000,False
2024-12-03 18:54:00,SPY,604.46,604.46,604.46,604.4600,393.0,21.0,604.460000,False
2024-12-03 18:55:00,SPY,604.46,604.46,604.46,604.4600,393.0,21.0,604.460000,False
2024-12-03 18:56:00,SPY,604.51,604.51,604.51,604.5100,427.0,21.0,604.510000,False


In [7]:
# S3 location of the parquet file under the `model/data` prefix.
# NOTE(review): the file name suggests averages/RSI/MACD/other features were
# added on top of the 1-minute bars — confirm with the job that writes it.
dependent_var_loc = 's3://sisyphus-general-bucket/AthenaInsights/latest_data/model/data/stock_bars_1min_base_avg_base_rsi_base_macd_base_otherfeatures_base_avg.parquet'
# No `columns` argument is passed, so no column selection is requested.
dependent_var = read_df(dependent_var_loc)

In [8]:
# Bare expression: renders the loaded frame as this cell's output.
dependent_var

Unnamed: 0_level_0,symbol,open,high,low,close,volume,trade_count,vwap,market_open,symbol1,...,time_since_prev_max_30,time_since_prev_min_30,max_today,min_today,max_today_session,min_today_session,category,future_highs,future_lows,slopes
us_eastern_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-01-02 04:00:00,SPY,476.25,476.36,476.00,476.3100,20460.0,84.0,476.301058,False,SPY,...,0,0,476.36,476.000,476.36,476.00,C,,,
2024-01-02 04:01:00,SPY,476.34,476.34,476.29,476.2900,6369.0,16.0,476.320154,False,SPY,...,1,1,476.36,476.000,476.36,476.00,C,,,
2024-01-02 04:02:00,SPY,476.29,476.29,476.28,476.2800,6152.0,10.0,476.280164,False,SPY,...,2,2,476.36,476.000,476.36,476.00,C,,,
2024-01-02 04:03:00,SPY,476.27,476.27,476.27,476.2700,369.0,10.0,476.270000,False,SPY,...,3,3,476.36,476.000,476.36,476.00,C,,,
2024-01-02 04:04:00,SPY,476.27,476.27,476.27,476.2700,369.0,10.0,476.270000,False,SPY,...,4,4,476.36,476.000,476.36,476.00,C,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-03 18:53:00,SPY,604.46,604.46,604.46,604.4600,393.0,21.0,604.460000,False,SPY,...,3,52,604.60,602.341,604.60,602.98,C,,,
2024-12-03 18:54:00,SPY,604.46,604.46,604.46,604.4600,393.0,21.0,604.460000,False,SPY,...,4,53,604.60,602.341,604.60,602.98,C,,,
2024-12-03 18:55:00,SPY,604.46,604.46,604.46,604.4600,393.0,21.0,604.460000,False,SPY,...,5,54,604.60,602.341,604.60,602.98,C,,,
2024-12-03 18:56:00,SPY,604.51,604.51,604.51,604.5100,427.0,21.0,604.510000,False,SPY,...,0,55,604.60,602.341,604.60,602.98,C,,,


In [15]:
import boto3
from misc.utils import read_df, load_config, log

# S3 bucket that is searched for subfolders further down in this cell.
bucket_name = 'sisyphus-general-bucket'
# Key prefix inside the bucket; a trailing '/' is appended where it is used.
prefix = 'AthenaInsights/latest_data'
# Not referenced in this cell — presumably a local output directory;
# TODO confirm against the cells that follow.
local_directory = 'results'
# Not referenced in this cell — presumably the file model results are written
# to; TODO confirm against the cells that follow.
results_file_path = 'results/models.txt'
# Subfolder names that are removed from the detected set before it is logged.
forbidden_subdirectories = ['csv', 'parquet']

def list_subfolders(bucket_name, prefix, client):
    """Collect the names of the "subfolders" found under an S3 prefix.

    Args:
        bucket_name: Passed as ``Bucket`` to the ``list_objects_v2`` paginator.
        prefix: Passed as ``Prefix``. Combined with ``Delimiter='/'`` the
            grouped key prefixes are read from each page's ``CommonPrefixes``.
        client: Object exposing ``get_paginator`` (this notebook passes a
            boto3 S3 client).

    Returns:
        A set holding the last path segment of every ``CommonPrefixes`` entry
        across all pages. Pages without that key contribute nothing.
    """
    pages = client.get_paginator('list_objects_v2').paginate(
        Bucket=bucket_name, Prefix=prefix, Delimiter='/'
    )
    return {
        # 'a/b/c/' -> 'a/b/c' -> 'c'
        entry['Prefix'].strip('/').rsplit('/', 1)[-1]
        for page in pages
        for entry in page.get('CommonPrefixes', [])
    }

# Default boto3 session and the S3 client created from it.
session = boto3.Session()
s3 = session.client('s3')

# List the direct children of `prefix` (the trailing '/' makes the listing
# start inside that "folder"), then drop the forbidden names.
subfolder_names = list_subfolders(bucket_name, f'{prefix}/', s3).difference(
    forbidden_subdirectories
)
log(f'subfolder_names detected - {subfolder_names}')

subfolder_names detected - {'spy_30min_v1'}
