In [22]:
import boto3
import pandas as pd
import numpy as np
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore")

### data

In [23]:
client = boto3.client('s3')

In [24]:
s3_prefix = 's3://'
bucket = 'sisyphus-general-bucket'
primary_folder = 'AthenaInsights'
data_folder = 'latest_data'

In [25]:
response = client.list_objects_v2(
    Bucket=bucket,
    Prefix=f'{primary_folder}/{data_folder}/reduced_autocorelation/')

In [26]:
files = []
all_symbols = set()

In [27]:
paths = []
for content in response.get('Contents', []):
    # print(f"{s3_prefix}{bucket}/{content['Key']}")
    paths.append(f"{s3_prefix}{bucket}/{content['Key']}")

In [28]:
def calculate_ma(df, ema=True, sma=True, calc_windows=[], prices=[]):
    # Function to apply moving averages
    def apply_moving_averages(group):
        for window in tqdm(calc_windows):
            for price in prices:
                if ema:
                    group[f'{price}_ema_{window}m'] = group[price].ewm(span=window, adjust=False).mean()
                if sma:
                    group[f'{price}_sma_{window}m'] = group[price].rolling(window=window).mean()
        return group

    # Apply function by group
    return df.groupby('symbol').apply(apply_moving_averages)

In [29]:
col_sets = [['open', 'high', 'low', 'close', 'volume', 'symbol', 'time_delta'], 
            ['open_diff', 'high_diff', 'low_diff', 'close_diff', 'volume_diff', 'symbol', 'time_delta']]

In [None]:
for path in paths:
    name = path.split('/')[-1].split('.')[0].split('_')[-1]
    if 'min' in name:
        window = int(name.replace('min', ''))
        period = 'min'
    elif 'D' in name:
        window = int(name.replace('D', ''))
        period = 'D'
    if period=='D':
        if window < 100:
            calc_windows = [2, 3, 5, 8, 10, 12, 15, 20, 26, 50, 100, 200]
        else: 
            calc_windows = [5, 8, 10, 12, 20, 26, 50]
    else:
        if window <= 30:
            calc_windows = [2, 3, 5, 8, 12, 10, 20, 26]
        else:
            calc_windows = [5, 8, 10, 12, 20, 26, 50, 100, 200]
    print(path)
    df = pd.read_parquet(path)
    df1 = calculate_ma(df[col_sets[0]], calc_windows=calc_windows, prices=col_sets[0][:-2]).droplevel(0)
    df1.to_parquet(path.replace('reduced_autocorelation', 'feature_prep').replace('.parquet', '_base.parquet'))
    print(f"written to {path.replace('reduced_autocorelation', 'feature_prep').replace('.parquet', '_base.parquet')}")    
    del df1
    df2 = calculate_ma(df[col_sets[1]], calc_windows=calc_windows, prices=col_sets[1][:-2]).droplevel(0)
    df2.to_parquet(path.replace('reduced_autocorelation', 'feature_prep').replace('.parquet', '_diff.parquet'))
    print(f"written to {path.replace('reduced_autocorelation', 'feature_prep').replace('.parquet', '_diff.parquet')}")    
    del df2

s3://sisyphus-general-bucket/AthenaInsights/latest_data/reduced_autocorelation/stock_bars_100D.parquet


100%|██████████| 7/7 [00:00<00:00, 138.03it/s]
100%|██████████| 7/7 [00:00<00:00, 390.14it/s]
100%|██████████| 7/7 [00:00<00:00, 232.45it/s]
100%|██████████| 7/7 [00:00<00:00, 545.13it/s]
100%|██████████| 7/7 [00:00<00:00, 212.47it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_100D_base.parquet


100%|██████████| 7/7 [00:00<00:00, 288.98it/s]
100%|██████████| 7/7 [00:00<00:00, 298.92it/s]
100%|██████████| 7/7 [00:00<00:00, 485.48it/s]
100%|██████████| 7/7 [00:00<00:00, 227.83it/s]
100%|██████████| 7/7 [00:00<00:00, 376.42it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_100D_diff.parquet
s3://sisyphus-general-bucket/AthenaInsights/latest_data/reduced_autocorelation/stock_bars_10D.parquet


100%|██████████| 12/12 [00:00<00:00, 237.05it/s]
100%|██████████| 12/12 [00:00<00:00, 279.16it/s]
100%|██████████| 12/12 [00:00<00:00, 321.11it/s]
100%|██████████| 12/12 [00:00<00:00, 358.04it/s]
100%|██████████| 12/12 [00:00<00:00, 380.76it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_10D_base.parquet


100%|██████████| 12/12 [00:00<00:00, 298.83it/s]
100%|██████████| 12/12 [00:00<00:00, 495.69it/s]
100%|██████████| 12/12 [00:00<00:00, 329.23it/s]
100%|██████████| 12/12 [00:00<00:00, 364.27it/s]
100%|██████████| 12/12 [00:00<00:00, 328.06it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_10D_diff.parquet
s3://sisyphus-general-bucket/AthenaInsights/latest_data/reduced_autocorelation/stock_bars_10min.parquet


100%|██████████| 8/8 [00:00<00:00, 90.98it/s]
100%|██████████| 8/8 [00:00<00:00, 105.75it/s]
100%|██████████| 8/8 [00:00<00:00, 106.17it/s]
100%|██████████| 8/8 [00:00<00:00, 107.63it/s]
100%|██████████| 8/8 [00:00<00:00, 96.09it/s]


In [None]:
print(path)
df = pd.read_parquet(path)
df

Unnamed: 0_level_0,symbol,timestamp,open,high,low,close,volume,trade_count,vwap,us_eastern_date,market_open,time_delta,open_diff,high_diff,low_diff,close_diff,volume_diff,vwap_diff
us_eastern_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2024-01-02 07:00:00,DAX,2024-01-02 12:00:00+00:00,30.800,30.8000,30.8000,30.8000,100.0,1.0,30.800000,2024-01-02,False,,,,,,,
2024-01-02 09:00:00,DAX,2024-01-02 14:00:00+00:00,30.650,30.7000,30.5410,30.7000,5091.0,38.0,30.599758,2024-01-02,False,,-0.150,-0.1000,-0.2590,-0.1000,4991.0,-0.200242
2024-01-02 10:00:00,DAX,2024-01-02 15:00:00+00:00,30.651,30.6900,30.6200,30.6775,700.0,6.0,30.659786,2024-01-02,True,,0.001,-0.0100,0.0790,-0.0225,-4391.0,0.060028
2024-01-02 11:00:00,DAX,2024-01-02 16:00:00+00:00,30.630,30.7000,30.6300,30.7000,9456.0,3.0,30.689006,2024-01-02,True,,-0.021,0.0100,0.0100,0.0225,8756.0,0.029220
2024-01-02 12:00:00,DAX,2024-01-02 17:00:00+00:00,30.640,30.6900,30.6400,30.6900,853.0,2.0,30.678394,2024-01-02,True,,0.010,-0.0100,0.0100,-0.0100,-8603.0,-0.010612
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-11-19 12:00:00,VIXM,2024-11-19 17:00:00+00:00,14.110,14.1303,14.0900,14.1190,7818.0,22.0,14.113324,2024-11-19,True,,-0.090,-0.0697,-0.0220,-0.0010,-9918.0,-0.040061
2024-11-19 13:00:00,VIXM,2024-11-19 18:00:00+00:00,14.140,14.1400,14.0301,14.0301,10354.0,68.0,14.098863,2024-11-19,True,,0.030,0.0097,-0.0599,-0.0889,2536.0,-0.014461
2024-11-19 14:00:00,VIXM,2024-11-19 19:00:00+00:00,14.035,14.1250,14.0302,14.0700,9192.0,74.0,14.062300,2024-11-19,True,,-0.105,-0.0150,0.0001,0.0399,-1162.0,-0.036563
2024-11-19 15:00:00,VIXM,2024-11-19 20:00:00+00:00,14.090,14.1300,14.0701,14.1300,10410.0,216.0,14.099419,2024-11-19,True,,0.055,0.0050,0.0399,0.0600,1218.0,0.037119


100%|██████████| 9/9 [00:00<00:00, 240.21it/s]
100%|██████████| 9/9 [00:00<00:00, 316.30it/s]
100%|██████████| 9/9 [00:00<00:00, 279.65it/s]
100%|██████████| 9/9 [00:00<00:00, 249.60it/s]
100%|██████████| 9/9 [00:00<00:00, 277.33it/s]
100%|██████████| 9/9 [00:00<00:00, 282.39it/s]
100%|██████████| 9/9 [00:00<00:00, 282.60it/s]
100%|██████████| 9/9 [00:00<00:00, 265.73it/s]
100%|██████████| 9/9 [00:00<00:00, 186.90it/s]
100%|██████████| 9/9 [00:00<00:00, 285.18it/s]


In [None]:
df12

Unnamed: 0_level_0,open_diff,high_diff,low_diff,close_diff,volume_diff,symbol,time_delta,open_diff_ema_5m,open_diff_sma_5m,high_diff_ema_5m,...,open_diff_ema_200m,open_diff_sma_200m,high_diff_ema_200m,high_diff_sma_200m,low_diff_ema_200m,low_diff_sma_200m,close_diff_ema_200m,close_diff_sma_200m,volume_diff_ema_200m,volume_diff_sma_200m
us_eastern_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-01-02 07:00:00,,,,,,DAX,,,,,...,,,,,,,,,,
2024-01-02 09:00:00,-0.150,-0.1000,-0.2590,-0.1000,4991.0,DAX,,-0.150000,,-0.100000,...,-0.150000,,-0.100000,,-0.259000,,-0.100000,,4991.000000,
2024-01-02 10:00:00,0.001,-0.0100,0.0790,-0.0225,-4391.0,DAX,,-0.099667,,-0.070000,...,-0.148498,,-0.099104,,-0.255637,,-0.099229,,4897.646766,
2024-01-02 11:00:00,-0.021,0.0100,0.0100,0.0225,8756.0,DAX,,-0.073444,,-0.043333,...,-0.147229,,-0.098019,,-0.252994,,-0.098018,,4936.038341,
2024-01-02 12:00:00,0.010,-0.0100,0.0100,-0.0100,-8603.0,DAX,,-0.045630,,-0.032222,...,-0.145664,,-0.097143,,-0.250377,,-0.097142,,4801.321541,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-11-19 12:00:00,-0.090,-0.0697,-0.0220,-0.0010,-9918.0,VIXM,,-0.060111,-0.05200,-0.062431,...,-0.003931,-0.003350,-0.004109,-0.003248,-0.003695,-0.003201,-0.003800,-0.00323,-91.148848,28.240
2024-11-19 13:00:00,0.030,0.0097,-0.0599,-0.0889,2536.0,VIXM,,-0.030074,-0.03400,-0.038388,...,-0.003593,-0.003250,-0.003971,-0.003500,-0.004254,-0.003500,-0.004647,-0.00350,-65.008063,-17.420
2024-11-19 14:00:00,-0.105,-0.0150,0.0001,0.0399,-1162.0,VIXM,,-0.055049,-0.08300,-0.030592,...,-0.004602,-0.003625,-0.004081,-0.003175,-0.004211,-0.003599,-0.004204,-0.00340,-75.923406,39.960
2024-11-19 15:00:00,0.055,0.0050,0.0399,0.0600,1218.0,VIXM,,-0.018366,-0.04206,-0.018728,...,-0.004009,-0.003050,-0.003991,-0.003250,-0.003772,-0.003149,-0.003565,-0.00320,-63.048546,31.245
