In [1]:
import boto3
import pandas as pd
import numpy as np
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore")

### data

In [2]:
# Low-level S3 client, used to enumerate the input parquet objects.
client = boto3.client(service_name='s3')

In [3]:
# Pieces of the S3 location; they are combined into object prefixes and full URIs.
bucket, primary_folder, data_folder = (
    'sisyphus-general-bucket',
    'AthenaInsights',
    'latest_data',
)
s3_prefix = 's3://'

In [4]:
# list_objects_v2 returns at most 1,000 keys per call, so a single response can be
# silently truncated. Walk every page with a paginator and merge the results.
_listing_prefix = f'{primary_folder}/{data_folder}/reduced_autocorelation/'
_listed_objects = []
_last_page = {}
for _last_page in client.get_paginator('list_objects_v2').paginate(
        Bucket=bucket, Prefix=_listing_prefix):
    # A page carries no 'Contents' key when it holds no objects.
    _listed_objects.extend(_last_page.get('Contents', []))

# Keep the single-response shape (a dict with a 'Contents' list) that the
# path-building cell reads via response.get('Contents', []); the remaining
# fields are taken from the final page.
response = {**_last_page, 'Contents': _listed_objects, 'KeyCount': len(_listed_objects)}

In [5]:
# Empty accumulators: `files` starts as a list, `all_symbols` as a set.
files, all_symbols = list(), set()

In [6]:
# Full s3:// URI for every object key in the listing ('Contents' is absent when
# the listing is empty, hence the default).
paths = [
    f"{s3_prefix}{bucket}/{content['Key']}"
    for content in response.get('Contents', [])
]

In [7]:
def calculate_ma(df, ema=True, sma=True, calc_windows=(), prices=()):
    """Add per-symbol exponential and/or simple moving-average columns to ``df``.

    Rows are grouped by the ``symbol`` column and every average is computed
    inside one group, so values never carry over from one symbol to the next.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``symbol`` column and every column named in ``prices``.
        The frame is not modified.
    ema : bool, default True
        Add ``{price}_ema_{window}m`` columns, computed with
        ``ewm(span=window, adjust=False).mean()``.
    sma : bool, default True
        Add ``{price}_sma_{window}m`` columns, computed with
        ``rolling(window=window).mean()``.
    calc_windows : iterable of int
        Window lengths, in rows. Their order fixes the order of the new columns.
    prices : iterable of str
        Names of the columns to average.

    Returns
    -------
    pandas.DataFrame
        The input rows ordered by symbol, with the new columns appended in
        (window, price, ema-then-sma) order. The symbol is prepended to the
        index as an extra outermost level named ``symbol``, so callers can
        ``droplevel(0)`` to get the original index labels back. When ``df``
        yields no groups, a zero-row frame with the same columns is returned
        and no index level is added.
    """
    # Materialise once: both are re-iterated for every symbol, which would
    # exhaust a generator after the first group.
    calc_windows = list(calc_windows)
    prices = list(prices)

    # Function to apply moving averages
    def apply_moving_averages(group):
        # Work on a copy; mutating the object handed out by groupby is unsupported.
        group = group.copy()
        for window in tqdm(calc_windows):
            for price in prices:
                if ema:
                    group[f'{price}_ema_{window}m'] = group[price].ewm(span=window, adjust=False).mean()
                if sma:
                    group[f'{price}_sma_{window}m'] = group[price].rolling(window=window).mean()
        return group

    # Iterate the groups explicitly instead of groupby.apply, so the result does
    # not depend on whether apply passes the grouping column to the function.
    symbols, frames = [], []
    for symbol, group in df.groupby('symbol'):
        symbols.append(symbol)
        frames.append(apply_moving_averages(group))

    if not frames:
        # pd.concat rejects an empty list; still hand back the output columns.
        return apply_moving_averages(df.iloc[0:0])

    # keys/names rebuild the (symbol, original index) MultiIndex.
    return pd.concat(frames, keys=symbols, names=['symbol'])

In [8]:
# Two column selections per file: the raw bar fields and their '_diff' counterparts.
# Each ends with the two non-price columns, which callers slice off with [:-2].
_bar_fields = ['open', 'high', 'low', 'close', 'volume']
_key_fields = ['symbol', 'time_delta']
col_sets = [
    _bar_fields + _key_fields,
    [f'{field}_diff' for field in _bar_fields] + _key_fields,
]

In [9]:
def _parse_timeframe(path):
    """Split the bar-size token of a parquet path into ``(count, unit)``.

    The token is the last ``_``-separated piece of the file name, taken up to
    the first ``.``; e.g. ``.../stock_bars_15min.parquet`` gives ``(15, 'min')``.
    Returns ``None`` unless the token is an integer followed by ``min`` or ``D``.
    """
    name = path.split('/')[-1].split('.')[0].split('_')[-1]
    for unit in ('min', 'D'):
        count = name[:-len(unit)]
        if name.endswith(unit) and count.isdecimal():
            return int(count), unit
    return None


def _windows_for(window, period):
    """Pick the moving-average window lengths used for one bar size."""
    if period == 'D':
        if window < 100:
            return [2, 3, 5, 8, 10, 12, 15, 20, 26, 50, 100, 200]
        return [5, 8, 10, 12, 20, 26, 50]
    if window <= 30:
        # 12 precedes 10 here; the order is kept because it determines the
        # order of the generated columns.
        return [2, 3, 5, 8, 12, 10, 20, 26]
    return [5, 8, 10, 12, 20, 26, 50, 100, 200]


for path in paths:
    timeframe = _parse_timeframe(path)
    if timeframe is None:
        # Without this guard `window`/`period` would be undefined on the first
        # file, or silently reused from the previous one.
        print(f"skipping {path}: cannot parse a '<n>min' or '<n>D' bar size from the file name")
        continue
    window, period = timeframe
    calc_windows = _windows_for(window, period)

    print(path)
    df = pd.read_parquet(path)

    # One output per column set: raw bar fields -> *_base, differenced fields -> *_diff.
    for columns, suffix in zip(col_sets, ('_base.parquet', '_diff.parquet')):
        out_path = path.replace('reduced_autocorelation', 'feature_prep').replace('.parquet', suffix)
        # The last two entries of each column set are not price fields, so they
        # are excluded from averaging; droplevel(0) removes the symbol level
        # that calculate_ma adds to the index.
        features = calculate_ma(df[columns], calc_windows=calc_windows, prices=columns[:-2]).droplevel(0)
        features.to_parquet(out_path)
        print(f"written to {out_path}")
        # Release the frame before the next one is built.
        del features

s3://sisyphus-general-bucket/AthenaInsights/latest_data/reduced_autocorelation/stock_bars_100D.parquet


100%|██████████| 7/7 [00:00<00:00, 325.22it/s]
100%|██████████| 7/7 [00:00<00:00, 228.00it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_100D_base.parquet


100%|██████████| 7/7 [00:00<00:00, 366.49it/s]
100%|██████████| 7/7 [00:00<00:00, 211.98it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_100D_diff.parquet
s3://sisyphus-general-bucket/AthenaInsights/latest_data/reduced_autocorelation/stock_bars_10D.parquet


100%|██████████| 12/12 [00:00<00:00, 208.19it/s]
100%|██████████| 12/12 [00:00<00:00, 301.67it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_10D_base.parquet


100%|██████████| 12/12 [00:00<00:00, 290.47it/s]
100%|██████████| 12/12 [00:00<00:00, 456.50it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_10D_diff.parquet
s3://sisyphus-general-bucket/AthenaInsights/latest_data/reduced_autocorelation/stock_bars_10min.parquet


100%|██████████| 8/8 [00:00<00:00, 138.46it/s]
100%|██████████| 8/8 [00:00<00:00, 145.76it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_10min_base.parquet


100%|██████████| 8/8 [00:00<00:00, 123.82it/s]
100%|██████████| 8/8 [00:00<00:00, 136.34it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_10min_diff.parquet
s3://sisyphus-general-bucket/AthenaInsights/latest_data/reduced_autocorelation/stock_bars_120min.parquet


100%|██████████| 9/9 [00:00<00:00, 293.60it/s]
100%|██████████| 9/9 [00:00<00:00, 284.13it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_120min_base.parquet


100%|██████████| 9/9 [00:00<00:00, 278.66it/s]
100%|██████████| 9/9 [00:00<00:00, 272.62it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_120min_diff.parquet
s3://sisyphus-general-bucket/AthenaInsights/latest_data/reduced_autocorelation/stock_bars_150D.parquet


100%|██████████| 7/7 [00:00<00:00, 307.78it/s]
100%|██████████| 7/7 [00:00<00:00, 329.47it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_150D_base.parquet


100%|██████████| 7/7 [00:00<00:00, 192.23it/s]
100%|██████████| 7/7 [00:00<00:00, 319.23it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_150D_diff.parquet
s3://sisyphus-general-bucket/AthenaInsights/latest_data/reduced_autocorelation/stock_bars_15D.parquet


100%|██████████| 12/12 [00:00<00:00, 223.58it/s]
100%|██████████| 12/12 [00:00<00:00, 316.32it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_15D_base.parquet


100%|██████████| 12/12 [00:00<00:00, 257.83it/s]
100%|██████████| 12/12 [00:00<00:00, 311.41it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_15D_diff.parquet
s3://sisyphus-general-bucket/AthenaInsights/latest_data/reduced_autocorelation/stock_bars_15min.parquet


100%|██████████| 8/8 [00:00<00:00, 218.41it/s]
100%|██████████| 8/8 [00:00<00:00, 226.23it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_15min_base.parquet


100%|██████████| 8/8 [00:00<00:00, 148.66it/s]
100%|██████████| 8/8 [00:00<00:00, 166.36it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_15min_diff.parquet
s3://sisyphus-general-bucket/AthenaInsights/latest_data/reduced_autocorelation/stock_bars_180min.parquet


100%|██████████| 9/9 [00:00<00:00, 315.01it/s]
100%|██████████| 9/9 [00:00<00:00, 291.12it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_180min_base.parquet


100%|██████████| 9/9 [00:00<00:00, 290.52it/s]
100%|██████████| 9/9 [00:00<00:00, 300.67it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_180min_diff.parquet
s3://sisyphus-general-bucket/AthenaInsights/latest_data/reduced_autocorelation/stock_bars_1D.parquet


100%|██████████| 12/12 [00:00<00:00, 318.71it/s]
100%|██████████| 12/12 [00:00<00:00, 253.69it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_1D_base.parquet


100%|██████████| 12/12 [00:00<00:00, 281.38it/s]
100%|██████████| 12/12 [00:00<00:00, 311.63it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_1D_diff.parquet
s3://sisyphus-general-bucket/AthenaInsights/latest_data/reduced_autocorelation/stock_bars_1min.parquet


100%|██████████| 8/8 [00:00<00:00, 19.21it/s]
100%|██████████| 8/8 [00:00<00:00, 18.95it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_1min_base.parquet


100%|██████████| 8/8 [00:00<00:00, 16.90it/s]
100%|██████████| 8/8 [00:00<00:00, 16.69it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_1min_diff.parquet
s3://sisyphus-general-bucket/AthenaInsights/latest_data/reduced_autocorelation/stock_bars_200D.parquet


100%|██████████| 7/7 [00:00<00:00, 341.36it/s]
100%|██████████| 7/7 [00:00<00:00, 315.04it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_200D_base.parquet


100%|██████████| 7/7 [00:00<00:00, 328.87it/s]
100%|██████████| 7/7 [00:00<00:00, 318.44it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_200D_diff.parquet
s3://sisyphus-general-bucket/AthenaInsights/latest_data/reduced_autocorelation/stock_bars_20D.parquet


100%|██████████| 12/12 [00:00<00:00, 295.10it/s]
100%|██████████| 12/12 [00:00<00:00, 279.53it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_20D_base.parquet


100%|██████████| 12/12 [00:00<00:00, 268.94it/s]
100%|██████████| 12/12 [00:00<00:00, 334.95it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_20D_diff.parquet
s3://sisyphus-general-bucket/AthenaInsights/latest_data/reduced_autocorelation/stock_bars_20min.parquet


100%|██████████| 8/8 [00:00<00:00, 223.34it/s]
100%|██████████| 8/8 [00:00<00:00, 253.41it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_20min_base.parquet


100%|██████████| 8/8 [00:00<00:00, 192.45it/s]
100%|██████████| 8/8 [00:00<00:00, 250.16it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_20min_diff.parquet
s3://sisyphus-general-bucket/AthenaInsights/latest_data/reduced_autocorelation/stock_bars_240min.parquet


100%|██████████| 9/9 [00:00<00:00, 289.78it/s]
100%|██████████| 9/9 [00:00<00:00, 280.32it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_240min_base.parquet


100%|██████████| 9/9 [00:00<00:00, 150.05it/s]
100%|██████████| 9/9 [00:00<00:00, 236.59it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_240min_diff.parquet
s3://sisyphus-general-bucket/AthenaInsights/latest_data/reduced_autocorelation/stock_bars_25min.parquet


100%|██████████| 8/8 [00:00<00:00, 277.75it/s]
100%|██████████| 8/8 [00:00<00:00, 178.63it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_25min_base.parquet


100%|██████████| 8/8 [00:00<00:00, 167.79it/s]
100%|██████████| 8/8 [00:00<00:00, 233.15it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_25min_diff.parquet
s3://sisyphus-general-bucket/AthenaInsights/latest_data/reduced_autocorelation/stock_bars_2D.parquet


100%|██████████| 12/12 [00:00<00:00, 302.42it/s]
100%|██████████| 12/12 [00:00<00:00, 466.71it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_2D_base.parquet


100%|██████████| 12/12 [00:00<00:00, 282.96it/s]
100%|██████████| 12/12 [00:00<00:00, 288.51it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_2D_diff.parquet
s3://sisyphus-general-bucket/AthenaInsights/latest_data/reduced_autocorelation/stock_bars_2min.parquet


100%|██████████| 8/8 [00:00<00:00, 38.15it/s]
100%|██████████| 8/8 [00:00<00:00, 46.86it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_2min_base.parquet


100%|██████████| 8/8 [00:00<00:00, 39.00it/s]
100%|██████████| 8/8 [00:00<00:00, 36.59it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_2min_diff.parquet
s3://sisyphus-general-bucket/AthenaInsights/latest_data/reduced_autocorelation/stock_bars_30D.parquet


100%|██████████| 12/12 [00:00<00:00, 407.24it/s]
100%|██████████| 12/12 [00:00<00:00, 248.70it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_30D_base.parquet


100%|██████████| 12/12 [00:00<00:00, 309.13it/s]
100%|██████████| 12/12 [00:00<00:00, 427.12it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_30D_diff.parquet
s3://sisyphus-general-bucket/AthenaInsights/latest_data/reduced_autocorelation/stock_bars_30min.parquet


100%|██████████| 8/8 [00:00<00:00, 204.68it/s]
100%|██████████| 8/8 [00:00<00:00, 299.21it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_30min_base.parquet


100%|██████████| 8/8 [00:00<00:00, 221.19it/s]
100%|██████████| 8/8 [00:00<00:00, 229.74it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_30min_diff.parquet
s3://sisyphus-general-bucket/AthenaInsights/latest_data/reduced_autocorelation/stock_bars_3D.parquet


100%|██████████| 12/12 [00:00<00:00, 238.86it/s]
100%|██████████| 12/12 [00:00<00:00, 295.17it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_3D_base.parquet


100%|██████████| 12/12 [00:00<00:00, 297.12it/s]
100%|██████████| 12/12 [00:00<00:00, 326.64it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_3D_diff.parquet
s3://sisyphus-general-bucket/AthenaInsights/latest_data/reduced_autocorelation/stock_bars_3min.parquet


100%|██████████| 8/8 [00:00<00:00, 59.65it/s]
100%|██████████| 8/8 [00:00<00:00, 71.36it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_3min_base.parquet


100%|██████████| 8/8 [00:00<00:00, 62.26it/s]
100%|██████████| 8/8 [00:00<00:00, 56.19it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_3min_diff.parquet
s3://sisyphus-general-bucket/AthenaInsights/latest_data/reduced_autocorelation/stock_bars_50D.parquet


100%|██████████| 12/12 [00:00<00:00, 307.43it/s]
100%|██████████| 12/12 [00:00<00:00, 307.13it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_50D_base.parquet


100%|██████████| 12/12 [00:00<00:00, 254.96it/s]
100%|██████████| 12/12 [00:00<00:00, 390.50it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_50D_diff.parquet
s3://sisyphus-general-bucket/AthenaInsights/latest_data/reduced_autocorelation/stock_bars_5D.parquet


100%|██████████| 12/12 [00:00<00:00, 286.44it/s]
100%|██████████| 12/12 [00:00<00:00, 334.20it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_5D_base.parquet


100%|██████████| 12/12 [00:00<00:00, 179.51it/s]
100%|██████████| 12/12 [00:00<00:00, 346.57it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_5D_diff.parquet
s3://sisyphus-general-bucket/AthenaInsights/latest_data/reduced_autocorelation/stock_bars_5min.parquet


100%|██████████| 8/8 [00:00<00:00, 85.68it/s]
100%|██████████| 8/8 [00:00<00:00, 101.14it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_5min_base.parquet


100%|██████████| 8/8 [00:00<00:00, 84.98it/s]
100%|██████████| 8/8 [00:00<00:00, 86.44it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_5min_diff.parquet
s3://sisyphus-general-bucket/AthenaInsights/latest_data/reduced_autocorelation/stock_bars_60min.parquet


100%|██████████| 9/9 [00:00<00:00, 257.83it/s]
100%|██████████| 9/9 [00:00<00:00, 378.81it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_60min_base.parquet


100%|██████████| 9/9 [00:00<00:00, 231.19it/s]
100%|██████████| 9/9 [00:00<00:00, 264.34it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_60min_diff.parquet
