In [1]:
import boto3
import pandas as pd
import numpy as np
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore")

### data

In [2]:
# S3 client; used below to list the objects under the data_prep prefix.
client = boto3.client('s3')

In [3]:
# Pieces of the S3 location: `s3_prefix` + `bucket` + object key form the
# full s3:// URIs, and `primary_folder` is the root of the listing prefix.
s3_prefix = 's3://'
bucket = 'sisyphus-general-bucket'
primary_folder = 'AthenaInsights'

In [4]:
# List the objects stored under <primary_folder>/data/data_prep/.
# NOTE(review): one list_objects_v2 call returns a single page of results
# (at most 1,000 keys per the S3 API).
response = client.list_objects_v2(
    Bucket=bucket,
    Prefix=f'{primary_folder}/data/data_prep/')

In [5]:
# NOTE(review): neither name is referenced by the cells visible here --
# possibly leftovers from an earlier version; confirm before removing.
files = []
all_symbols = set()

In [6]:
# Build the full s3:// URI of every object under the data_prep prefix.
# `response` is only the first page of the listing; keep requesting pages
# while S3 reports the listing as truncated so no key is silently dropped.
paths = []
page = response
while True:
    paths.extend(
        f"{s3_prefix}{bucket}/{content['Key']}"
        for content in page.get('Contents', [])  # 'Contents' is absent when a page has no keys
    )
    if not page.get('IsTruncated'):
        break
    page = client.list_objects_v2(
        Bucket=bucket,
        Prefix=f'{primary_folder}/data/data_prep/',
        ContinuationToken=page['NextContinuationToken'])

In [7]:
def calculate_ma(df, ema=True, sma=True, calc_windows=None):
    """Add per-symbol exponential and/or simple moving-average columns.

    For every window ``w`` in ``calc_windows`` and every column in
    open/high/low/close/volume the result gains ``{col}_ema_{w}m`` (when
    ``ema`` is true) and ``{col}_sma_{w}m`` (when ``sma`` is true).  Windows
    are counted in rows, and the ``m`` suffix is used in the column name no
    matter what bar size the input frame holds.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``symbol`` column plus ``open``, ``high``, ``low``,
        ``close`` and ``volume``.  It is not modified.
    ema : bool, default True
        Emit the ``ewm(span=w, adjust=False).mean()`` columns.
    sma : bool, default True
        Emit the ``rolling(window=w).mean()`` columns; the first ``w - 1``
        rows of each symbol are NaN.
    calc_windows : iterable of int, optional
        Window lengths.  ``None`` (the default) means no windows, i.e. no
        feature columns are added.

    Returns
    -------
    pandas.DataFrame
        The ``groupby('symbol').apply(...)`` result: rows ordered by symbol,
        with the symbol key prepended as the outermost index level (callers
        strip it with ``droplevel(0)``).
    """
    # Materialise once: avoids the shared mutable default argument and keeps a
    # one-shot iterable from being exhausted after the first symbol.
    windows = () if calc_windows is None else tuple(calc_windows)
    price_columns = ('open', 'high', 'low', 'close', 'volume')

    def apply_moving_averages(group):
        # pandas does not support mutating the object handed to the applied
        # function, so add the feature columns to a private copy.
        group = group.copy()
        for window in tqdm(windows):
            for price in price_columns:
                series = group[price]
                if ema:
                    group[f'{price}_ema_{window}m'] = series.ewm(span=window, adjust=False).mean()
                if sma:
                    group[f'{price}_sma_{window}m'] = series.rolling(window=window).mean()
        return group

    # group_keys=True is spelled out because callers rely on the extra
    # outermost index level being present.
    return df.groupby('symbol', group_keys=True).apply(apply_moving_averages)

In [8]:
# For every prepared bar file: pick the moving-average windows that suit its
# bar size, compute the features per symbol and write the result to the
# matching location under 'feature_prep'.
for path in paths:
    # The bar size is the last '_'-separated token of the file stem,
    # e.g. '.../stock_bars_100D.parquet' -> '100D'.
    name = path.split('/')[-1].split('.')[0].split('_')[-1]
    if 'min' in name:
        period = 'min'
    elif 'D' in name:
        period = 'D'
    else:
        # Without this branch `period`/`window` would be undefined on the
        # first pass, or silently carried over from the previous file.
        print(f"skipping {path}: no 'min' or 'D' bar size in '{name}'")
        continue
    try:
        window = int(name.replace(period, ''))
    except ValueError:
        print(f"skipping {path}: cannot parse a bar size from '{name}'")
        continue

    if period == 'D':
        if window < 100:
            calc_windows = [2, 3, 5, 8, 10, 12, 15, 20, 26, 50, 100, 200]
        else:
            calc_windows = [5, 8, 10, 12, 20, 26, 50]
    else:
        if window <= 30:
            # 12 precedes 10 here; the order only affects the order in which
            # the feature columns are appended.
            calc_windows = [2, 3, 5, 8, 12, 10, 20, 26]
        else:
            calc_windows = [5, 8, 10, 12, 20, 26, 50, 100, 200]

    out_path = path.replace('data_prep', 'feature_prep')
    print(path)
    df = pd.read_parquet(path)
    # calculate_ma returns the per-symbol groupby result; droplevel(0) removes
    # the outermost index level so the frame keeps its original index.
    df = calculate_ma(df, calc_windows=calc_windows).droplevel(0)
    df.to_parquet(out_path)
    print(f"written to {out_path}")

s3://sisyphus-general-bucket/AthenaInsights/data/data_prep/stock_bars_100D.parquet


100%|██████████| 7/7 [00:00<00:00, 72.02it/s]
100%|██████████| 7/7 [00:00<00:00, 135.13it/s]
100%|██████████| 7/7 [00:00<00:00, 127.20it/s]
100%|██████████| 7/7 [00:00<00:00, 101.74it/s]
100%|██████████| 7/7 [00:00<00:00, 114.39it/s]


                         open     high      low    close     volume symbol  \
us_eastern_timestamp                                                         
2020-01-02            28.5375  28.6631  17.0000  22.1176  1009552.0    DAX   
2020-04-11            22.4500  27.9883  20.7364  27.8300  1353828.0    DAX   
2020-07-20            27.9200  29.7200  26.5624  26.6800  1243774.0    DAX   
2020-10-28            25.6400  32.0299  25.1275  31.0250   975409.0    DAX   
2021-02-05            31.1600  34.6100  30.6000  34.5538   699338.0    DAX   

                      open_ema_5m  open_sma_5m  high_ema_5m  high_sma_5m  ...  \
us_eastern_timestamp                                                      ...   
2020-01-02              28.537500          NaN    28.663100          NaN  ...   
2020-04-11              26.508333          NaN    28.438167          NaN  ...   
2020-07-20              26.978889          NaN    28.865444          NaN  ...   
2020-10-28              26.532593          NaN  

100%|██████████| 12/12 [00:00<00:00, 113.55it/s]
100%|██████████| 12/12 [00:00<00:00, 91.88it/s]
100%|██████████| 12/12 [00:00<00:00, 92.43it/s]
100%|██████████| 12/12 [00:00<00:00, 111.23it/s]
100%|██████████| 12/12 [00:00<00:00, 92.98it/s]


                         open     high     low  close   volume symbol  \
us_eastern_timestamp                                                    
2020-01-02            28.5375  28.6631  27.800  28.50  14355.0    DAX   
2020-01-12            28.2400  28.5700  28.225  28.48  44010.0    DAX   
2020-01-22            28.4000  28.6499  27.100  27.10  57643.0    DAX   
2020-02-01            27.2300  28.3900  27.230  27.95  20109.0    DAX   
2020-02-11            28.1600  28.5599  27.940  27.94  18150.0    DAX   

                      open_ema_2m  open_sma_2m  high_ema_2m  high_sma_2m  ...  \
us_eastern_timestamp                                                      ...   
2020-01-02              28.537500          NaN    28.663100          NaN  ...   
2020-01-12              28.339167     28.38875    28.601033     28.61655  ...   
2020-01-22              28.379722     28.32000    28.633611     28.60995  ...   
2020-02-01              27.613241     27.81500    28.471204     28.51995  ...   
20

100%|██████████| 7/7 [00:00<00:00, 110.85it/s]
100%|██████████| 7/7 [00:00<00:00, 92.90it/s]
100%|██████████| 7/7 [00:00<00:00, 131.10it/s]
100%|██████████| 7/7 [00:00<00:00, 167.95it/s]
100%|██████████| 7/7 [00:00<00:00, 152.39it/s]


                         open     high      low    close     volume symbol  \
us_eastern_timestamp                                                         
2020-01-02            28.5375  28.6631  17.0000  24.5780  1459464.0    DAX   
2020-05-31            24.8600  29.7200  24.8350  26.6800  2147690.0    DAX   
2020-10-28            25.6400  32.6700  25.1275  32.2999  1262814.0    DAX   
2021-03-27            32.3200  35.3999  32.1100  33.7700  1231875.0    DAX   
2021-08-24            33.7700  34.0900  30.6850  31.9200  1399074.0    DAX   

                      open_ema_5m  open_sma_5m  high_ema_5m  high_sma_5m  ...  \
us_eastern_timestamp                                                      ...   
2020-01-02              28.537500          NaN    28.663100          NaN  ...   
2020-05-31              27.311667          NaN    29.015400          NaN  ...   
2020-10-28              26.754444          NaN    30.233600          NaN  ...   
2021-03-27              28.609630          NaN  

100%|██████████| 12/12 [00:00<00:00, 80.26it/s]
100%|██████████| 12/12 [00:00<00:00, 93.15it/s]
100%|██████████| 12/12 [00:00<00:00, 95.15it/s] 
100%|██████████| 12/12 [00:00<00:00, 149.27it/s]
100%|██████████| 12/12 [00:00<00:00, 70.91it/s]


                         open     high      low    close    volume symbol  \
us_eastern_timestamp                                                        
2020-01-02            28.5375  28.6631  27.8000  28.3801   45838.0    DAX   
2020-01-17            28.5600  28.6499  27.1000  27.1000   70170.0    DAX   
2020-02-01            27.2300  28.5599  27.2300  28.2900   31296.0    DAX   
2020-02-16            28.1100  28.3000  24.4729  24.9500   57433.0    DAX   
2020-03-02            25.0900  25.7500  17.8700  18.2900  231792.0    DAX   

                      open_ema_2m  open_sma_2m  high_ema_2m  high_sma_2m  ...  \
us_eastern_timestamp                                                      ...   
2020-01-02              28.537500          NaN    28.663100          NaN  ...   
2020-01-17              28.552500     28.54875    28.654300     28.65650  ...   
2020-02-01              27.670833     27.89500    28.591367     28.60490  ...   
2020-02-16              27.963611     27.67000    28.39

100%|██████████| 12/12 [00:00<00:00, 82.13it/s]
100%|██████████| 12/12 [00:00<00:00, 140.63it/s]
100%|██████████| 12/12 [00:00<00:00, 146.70it/s]
100%|██████████| 12/12 [00:00<00:00, 149.33it/s]
100%|██████████| 12/12 [00:00<00:00, 157.61it/s]


                     symbol                 timestamp     open     high  \
us_eastern_timestamp                                                      
2020-01-02              DAX 2020-01-02 05:00:00+00:00  28.5375  28.6631   
2020-01-03              DAX 2020-01-03 05:00:00+00:00  27.9800  28.1114   
2020-01-06              DAX 2020-01-06 05:00:00+00:00  27.8000  27.8948   
2020-01-07              DAX 2020-01-07 05:00:00+00:00  28.0400  28.0658   
2020-01-08              DAX 2020-01-08 05:00:00+00:00  28.1550  28.2122   

                          low    close  volume  trade_count       vwap  \
us_eastern_timestamp                                                     
2020-01-02            28.4050  28.4635  1596.0         31.0  28.483275   
2020-01-03            27.9701  27.9900  2848.0         25.0  28.027624   
2020-01-06            27.8000  27.8948   975.0         15.0  27.825705   
2020-01-07            28.0400  28.0400   665.0         21.0  28.050874   
2020-01-08            28.1550 

100%|██████████| 7/7 [00:00<00:00, 85.04it/s]
100%|██████████| 7/7 [00:00<00:00, 120.08it/s]
100%|██████████| 7/7 [00:00<00:00, 96.71it/s]
100%|██████████| 7/7 [00:00<00:00, 167.13it/s]
100%|██████████| 7/7 [00:00<00:00, 105.10it/s]


                         open     high      low   close     volume symbol  \
us_eastern_timestamp                                                        
2020-01-02            28.5375  28.6631  17.0000  27.830  2363380.0    DAX   
2020-07-20            27.9200  32.0299  25.1275  31.025  2219183.0    DAX   
2021-02-05            31.1600  35.3999  30.6000  33.770  1519280.0    DAX   
2021-08-24            33.7700  34.0900  24.5000  26.500  2302138.0    DAX   
2022-03-12            27.3900  29.8086  20.0350  20.130  3270963.0    DAX   

                      open_ema_5m  open_sma_5m  high_ema_5m  high_sma_5m  ...  \
us_eastern_timestamp                                                      ...   
2020-01-02              28.537500          NaN    28.663100          NaN  ...   
2020-07-20              28.331667          NaN    29.785367          NaN  ...   
2021-02-05              29.274444          NaN    31.656878          NaN  ...   
2021-08-24              30.772963          NaN    32.46

100%|██████████| 12/12 [00:00<00:00, 100.00it/s]
100%|██████████| 12/12 [00:00<00:00, 139.84it/s]
100%|██████████| 12/12 [00:00<00:00, 143.44it/s]
100%|██████████| 12/12 [00:00<00:00, 133.60it/s]
100%|██████████| 12/12 [00:00<00:00, 150.10it/s]


                         open     high      low    close    volume symbol  \
us_eastern_timestamp                                                        
2020-01-02            28.5375  28.6631  27.8000  28.4800   58365.0    DAX   
2020-01-22            28.4000  28.6499  27.1000  27.9500   77752.0    DAX   
2020-02-11            28.1600  28.5599  24.4729  24.9500   68620.0    DAX   
2020-03-02            25.0900  25.7500  17.0000  17.7299  368863.0    DAX   
2020-03-22            18.0900  22.3181  17.6500  22.1176  435952.0    DAX   

                      open_ema_2m  open_sma_2m  high_ema_2m  high_sma_2m  ...  \
us_eastern_timestamp                                                      ...   
2020-01-02              28.537500          NaN    28.663100          NaN  ...   
2020-01-22              28.445833     28.46875    28.654300     28.65650  ...   
2020-02-11              28.255278     28.28000    28.591367     28.60490  ...   
2020-03-02              26.145093     26.62500    26.69

100%|██████████| 12/12 [00:00<00:00, 107.29it/s]
100%|██████████| 12/12 [00:00<00:00, 99.52it/s]
100%|██████████| 12/12 [00:00<00:00, 148.56it/s]
100%|██████████| 12/12 [00:00<00:00, 87.85it/s]
100%|██████████| 12/12 [00:00<00:00, 150.66it/s]


                         open     high      low  close  volume symbol  \
us_eastern_timestamp                                                    
2020-01-02            28.5375  28.6631  27.9701  27.99  4444.0    DAX   
2020-01-04                NaN      NaN      NaN    NaN     0.0    DAX   
2020-01-06            27.8000  28.0658  27.8000  28.04  1640.0    DAX   
2020-01-08            28.1550  28.5100  28.1550  28.51  7483.0    DAX   
2020-01-10            28.5172  28.5172  28.5000  28.50   788.0    DAX   

                      open_ema_2m  open_sma_2m  high_ema_2m  high_sma_2m  ...  \
us_eastern_timestamp                                                      ...   
2020-01-02              28.537500          NaN    28.663100          NaN  ...   
2020-01-04              28.537500          NaN    28.663100          NaN  ...   
2020-01-06              27.905357          NaN    28.151129          NaN  ...   
2020-01-08              28.071786      27.9775    28.390376      28.2879  ...   
20

100%|██████████| 12/12 [00:00<00:00, 162.12it/s]
100%|██████████| 12/12 [00:00<00:00, 146.53it/s]
100%|██████████| 12/12 [00:00<00:00, 128.00it/s]
100%|██████████| 12/12 [00:00<00:00, 176.47it/s]
100%|██████████| 12/12 [00:00<00:00, 155.18it/s]


                         open     high      low   close    volume symbol  \
us_eastern_timestamp                                                       
2020-01-02            28.5375  28.6631  27.1000  27.100  116008.0    DAX   
2020-02-01            27.2300  28.5599  24.4729  24.950   88729.0    DAX   
2020-03-02            25.0900  25.7500  17.0000  20.460  493137.0    DAX   
2020-04-01            19.7700  23.1795  19.4000  22.730  565586.0    DAX   
2020-05-01            22.2400  24.9600  20.7364  24.578  196004.0    DAX   

                      open_ema_2m  open_sma_2m  high_ema_2m  high_sma_2m  ...  \
us_eastern_timestamp                                                      ...   
2020-01-02              28.537500          NaN    28.663100          NaN  ...   
2020-02-01              27.665833     27.88375    28.594300     28.61150  ...   
2020-03-02              25.948611     26.16000    26.698100     27.15495  ...   
2020-04-01              21.829537     22.43000    24.352367   

100%|██████████| 12/12 [00:00<00:00, 148.70it/s]
100%|██████████| 12/12 [00:00<00:00, 158.97it/s]
100%|██████████| 12/12 [00:00<00:00, 147.44it/s]
100%|██████████| 12/12 [00:00<00:00, 86.24it/s]
100%|██████████| 12/12 [00:00<00:00, 98.28it/s]


                         open     high      low    close   volume symbol  \
us_eastern_timestamp                                                       
2020-01-02            28.5375  28.6631  27.9701  27.9900   4444.0    DAX   
2020-01-05            27.8000  28.0658  27.8000  28.0400   1640.0    DAX   
2020-01-08            28.1550  28.5172  28.1550  28.5000   8271.0    DAX   
2020-01-11            28.2400  28.5300  28.2250  28.5300   3965.0    DAX   
2020-01-14            28.3419  28.5000  28.3158  28.3801  27518.0    DAX   

                      open_ema_2m  open_sma_2m  high_ema_2m  high_sma_2m  ...  \
us_eastern_timestamp                                                      ...   
2020-01-02              28.537500          NaN    28.663100          NaN  ...   
2020-01-05              28.045833     28.16875    28.264900     28.36445  ...   
2020-01-08              28.118611     27.97750    28.433100     28.29150  ...   
2020-01-11              28.199537     28.19750    28.497700   

100%|██████████| 12/12 [00:00<00:00, 125.70it/s]
100%|██████████| 12/12 [00:00<00:00, 174.92it/s]
100%|██████████| 12/12 [00:00<00:00, 171.40it/s]
100%|██████████| 12/12 [00:00<00:00, 162.81it/s]
100%|██████████| 12/12 [00:00<00:00, 185.44it/s]


                         open     high      low    close    volume symbol  \
us_eastern_timestamp                                                        
2020-01-02            28.5375  28.6631  27.1000  27.9400  154267.0    DAX   
2020-02-21            27.9418  27.9418  17.0000  22.1176  855285.0    DAX   
2020-04-11            22.4500  24.9600  20.7364  24.5780  449912.0    DAX   
2020-05-31            24.8600  27.9883  24.8350  27.8300  903916.0    DAX   
2020-07-20            27.9200  29.7200  26.9500  28.9399  910442.0    DAX   

                      open_ema_2m  open_sma_2m  high_ema_2m  high_sma_2m  ...  \
us_eastern_timestamp                                                      ...   
2020-01-02              28.537500          NaN    28.663100          NaN  ...   
2020-02-21              28.140367     28.23965    28.182233     28.30245  ...   
2020-04-11              24.346789     25.19590    26.034078     26.45090  ...   
2020-05-31              24.688930     23.65500    27.33

100%|██████████| 12/12 [00:00<00:00, 131.04it/s]
100%|██████████| 12/12 [00:00<00:00, 96.56it/s] 
100%|██████████| 12/12 [00:00<00:00, 112.29it/s]
100%|██████████| 12/12 [00:00<00:00, 160.78it/s]
100%|██████████| 12/12 [00:00<00:00, 153.30it/s]


                         open     high     low    close   volume symbol  \
us_eastern_timestamp                                                      
2020-01-02            28.5375  28.6631  27.800  27.8948   5419.0    DAX   
2020-01-07            28.0400  28.5172  28.040  28.5000   8936.0    DAX   
2020-01-12            28.2400  28.5300  28.225  28.3801  31483.0    DAX   
2020-01-17            28.5600  28.5700  28.320  28.4800  12527.0    DAX   
2020-01-22            28.4000  28.6499  28.035  28.2700  16493.0    DAX   

                      open_ema_2m  open_sma_2m  high_ema_2m  high_sma_2m  ...  \
us_eastern_timestamp                                                      ...   
2020-01-02              28.537500          NaN    28.663100          NaN  ...   
2020-01-07              28.205833     28.28875    28.565833     28.59015  ...   
2020-01-12              28.228611     28.14000    28.541944     28.52360  ...   
2020-01-17              28.449537     28.40000    28.560648     28.55