In [1]:
import boto3
import pandas as pd
import numpy as np
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore")

### data

In [2]:
# Low-level S3 client used to list the prepared data files.
# No credentials or region are passed explicitly here.
client = boto3.client('s3')

In [3]:
# Pieces used to build object keys and full s3:// URIs in the cells below.
s3_prefix = 's3://'  # URI scheme prepended to "<bucket>/<key>"
bucket = 'sisyphus-general-bucket'  # bucket that is listed and read from / written to
primary_folder = 'AthenaInsights'  # leading key prefix inside the bucket

In [4]:
# A single list_objects_v2 call returns at most 1,000 keys, so a larger prefix
# would be silently truncated. Walk every page and merge the results into a
# dict with the same 'Contents' shape that downstream cells read.
data_prep_prefix = f'{primary_folder}/data/data_prep/'
paginator = client.get_paginator('list_objects_v2')
response = {
    'Contents': [
        obj
        for page in paginator.paginate(Bucket=bucket, Prefix=data_prep_prefix)
        # Pages for an empty prefix carry no 'Contents' key at all.
        for obj in page.get('Contents', [])
    ]
}

In [5]:
# NOTE(review): neither container is referenced by the other cells visible
# here — confirm they are still needed before keeping or removing them.
files = []
all_symbols = set()

In [6]:
# Turn every listed object key into a fully qualified s3:// URI.
paths = [
    f"{s3_prefix}{bucket}/{entry['Key']}"
    for entry in response.get('Contents', [])
]

In [7]:
def calculate_ma(df, ema=True, sma=True, calc_windows=()):
    """Add moving-average feature columns, computed separately per symbol.

    For every window in ``calc_windows`` and every one of the columns
    ``open``, ``high``, ``low``, ``close`` and ``volume`` this adds:

    * ``{price}_ema_{window}m`` -- ``ewm(span=window, adjust=False).mean()``
      (when ``ema`` is true)
    * ``{price}_sma_{window}m`` -- ``rolling(window=window).mean()``; the first
      ``window - 1`` rows of each symbol are NaN (when ``sma`` is true)

    Parameters
    ----------
    df : pandas.DataFrame
        Must be groupable by ``'symbol'`` and contain the five price/volume
        columns listed above. Rows are used in their existing order.
    ema, sma : bool
        Toggle the exponential / simple moving averages.
    calc_windows : iterable of int, optional
        Window lengths (in rows). Empty by default, which adds no columns.

    Returns
    -------
    pandas.DataFrame
        The result of ``df.groupby('symbol').apply(...)`` with the new columns
        appended. The input frame is not modified.
    """
    # Snapshot the windows once: an immutable default replaces the shared
    # mutable ``[]``, and a one-shot iterable (e.g. a generator) is no longer
    # exhausted after the first group.
    windows = list(calc_windows) if calc_windows is not None else []

    def apply_moving_averages(group):
        # groupby.apply does not support functions that mutate the object they
        # are handed, so work on a private copy of the group.
        group = group.copy()
        for window in tqdm(windows):
            for price in ['open', 'high', 'low', 'close', 'volume']:
                if ema:
                    group[f'{price}_ema_{window}m'] = group[price].ewm(span=window, adjust=False).mean()
                if sma:
                    group[f'{price}_sma_{window}m'] = group[price].rolling(window=window).mean()
        return group

    # Apply function by group
    return df.groupby('symbol').apply(apply_moving_averages)

In [None]:
# Build moving-average features for every bar file listed under data_prep/
# and write each result to the matching location under feature_prep/.
for path in paths:
    # The bar interval is the last '_'-separated token of the file stem,
    # e.g. '.../stock_bars_10min.parquet' -> '10min'.
    name = path.split('/')[-1].split('.')[0].split('_')[-1]
    if name.endswith('min') and name[:-3].isdecimal():
        window, period = int(name[:-3]), 'min'
    elif name.endswith('D') and name[:-1].isdecimal():
        window, period = int(name[:-1]), 'D'
    else:
        # Keys that are not '<N>min' / '<N>D' (e.g. an S3 folder placeholder)
        # previously reused the last iteration's window/period, or raised
        # NameError on the first iteration. Skip them explicitly instead.
        print(f"skipping {path}: no '<N>min' or '<N>D' interval in file name")
        continue

    # Choose the moving-average windows from the bar size: daily bars below
    # 100D and minute bars above 30min get the lists that reach 200.
    if period == 'D':
        calc_windows = [2, 3, 5, 10, 15, 20, 50, 100, 200] if window < 100 else [5, 10, 20, 50]
    else:
        calc_windows = [2, 3, 5, 10, 20] if window <= 30 else [5, 10, 20, 50, 100, 200]

    # Swap only the folder segment (first occurrence), so a bucket or file
    # name that happens to contain 'data_prep' is left alone.
    out_path = path.replace('/data_prep/', '/feature_prep/', 1)

    print(path)
    df = pd.read_parquet(path)
    # The groupby-apply result carries an extra outermost index level
    # (presumably the 'symbol' group key); drop it before writing.
    df = calculate_ma(df, calc_windows=calc_windows).droplevel(0)
    df.to_parquet(out_path)
    print(f"written to {out_path}")

s3://sisyphus-general-bucket/AthenaInsights/data/data_prep/stock_bars_100D.parquet


100%|██████████| 4/4 [00:00<00:00, 61.93it/s]
100%|██████████| 4/4 [00:00<00:00, 88.27it/s]
100%|██████████| 4/4 [00:00<00:00, 136.12it/s]
100%|██████████| 4/4 [00:00<00:00, 109.62it/s]
100%|██████████| 4/4 [00:00<00:00, 127.91it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/data/feature_prep/stock_bars_100D.parquet
s3://sisyphus-general-bucket/AthenaInsights/data/data_prep/stock_bars_10D.parquet


100%|██████████| 9/9 [00:00<00:00, 83.80it/s]
100%|██████████| 9/9 [00:00<00:00, 138.47it/s]
100%|██████████| 9/9 [00:00<00:00, 104.49it/s]
100%|██████████| 9/9 [00:00<00:00, 134.02it/s]
100%|██████████| 9/9 [00:00<00:00, 137.97it/s]


written to s3://sisyphus-general-bucket/AthenaInsights/data/feature_prep/stock_bars_10D.parquet
s3://sisyphus-general-bucket/AthenaInsights/data/data_prep/stock_bars_10min.parquet


100%|██████████| 5/5 [00:00<00:00, 13.80it/s]
100%|██████████| 5/5 [00:00<00:00, 22.05it/s]
100%|██████████| 5/5 [00:00<00:00, 14.25it/s]
100%|██████████| 5/5 [00:00<00:00, 13.93it/s]
100%|██████████| 5/5 [00:00<00:00, 13.41it/s]
