### config

In [39]:
import yaml
import boto3
import os
import pandas as pd
import numpy as np
import warnings
from tqdm import tqdm
import pyarrow.dataset as ds
import pyarrow.parquet as pq
warnings.filterwarnings("ignore")

In [2]:
import os
os.getcwd()

'/home/ec2-user/SageMaker/sisyphus/AthenaInsights/notebooks/data_cleaning'

In [3]:
def load_yaml(loc):
    with open(loc) as stream:
        try:
            print(f'reading from {loc}')
            return yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)

In [4]:
all_paths_config_loc = '../../config/spy_30min_v1/features.yaml'
all_paths_config = load_yaml(all_paths_config_loc)
all_paths_config.keys()

reading from ../../config/spy_30min_v1/features.yaml


dict_keys(['dependent_variable', 'feature_store'])

In [5]:
all_paths_config['feature_store']['1min'].keys()

dict_keys(['stock_bars_1min_base', 'stock_bars_1min_base_macd', 'stock_bars_1min_base_other_features', 'stock_bars_1min_base_rsi', 'stock_bars_1min_diff', 'stock_bars_1min_diff_macd', 'stock_bars_1min_diff_rsi'])

In [6]:
all_paths_config['feature_store']['100D'].keys()

dict_keys(['stock_bars_100D_base', 'stock_bars_100D_base_macd', 'stock_bars_100D_base_rsi', 'stock_bars_100D_diff', 'stock_bars_100D_diff_macd', 'stock_bars_100D_diff_rsi'])

### independent variables

In [7]:
try:
    cols = eval(all_paths_config['feature_store']['1min']['stock_bars_1min_base']['cols'])
except:
    cols = all_paths_config['feature_store']['1min']['stock_bars_1min_base']['cols']
path = all_paths_config['feature_store']['1min']['stock_bars_1min_base']['path']
path, len(cols)

('s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_1min_base.parquet',
 87)

In [8]:
one_min_base_df = pq.read_pandas(path, columns=["open", "high", "low", "close", "volume", "symbol"]).to_pandas() # pq.read_pandas('example.parquet', columns=['two']).to_pandas()
one_min_base_df = one_min_base_df[one_min_base_df.symbol=='SPY']
one_min_base_df.head()

Unnamed: 0_level_0,open,high,low,close,volume,symbol
us_eastern_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-01-02 04:00:00,476.25,476.36,476.0,476.31,20460.0,SPY
2024-01-02 04:01:00,476.34,476.34,476.29,476.29,6369.0,SPY
2024-01-02 04:02:00,476.29,476.29,476.28,476.28,6152.0,SPY
2024-01-02 04:03:00,476.27,476.27,476.27,476.27,369.0,SPY
2024-01-02 04:04:00,476.27,476.27,476.27,476.27,369.0,SPY


In [9]:
# spy_1m_df_loc = f'{s3_prefix}{bucket}/{primary_folder}/data/data_prep/stock_bars_1min.parquet'
spy_1m_df_loc = 's3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_1min_diff.parquet'
spy_1m_df = pd.read_parquet(spy_1m_df_loc)
spy_1m_df = spy_1m_df[spy_1m_df.symbol=='SPY']

In [10]:
spy_1m_df.head()

Unnamed: 0_level_0,open_diff,high_diff,low_diff,close_diff,volume_diff,symbol,time_delta,open_diff_ema_2m,open_diff_sma_2m,high_diff_ema_2m,...,open_diff_ema_26m,open_diff_sma_26m,high_diff_ema_26m,high_diff_sma_26m,low_diff_ema_26m,low_diff_sma_26m,close_diff_ema_26m,close_diff_sma_26m,volume_diff_ema_26m,volume_diff_sma_26m
us_eastern_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-01-02 04:00:00,,,,,,SPY,,,,,...,,,,,,,,,,
2024-01-02 04:01:00,0.09,-0.02,0.29,-0.02,-14091.0,SPY,,0.09,,-0.02,...,0.09,,-0.02,,0.29,,-0.02,,-14091.0,
2024-01-02 04:02:00,-0.05,-0.05,-0.01,-0.01,-217.0,SPY,,-0.003333,0.02,-0.04,...,0.07963,,-0.022222,,0.267778,,-0.019259,,-13063.296296,
2024-01-02 04:03:00,-0.02,-0.02,-0.01,-0.01,-5783.0,SPY,,-0.014444,-0.035,-0.026667,...,0.07225,,-0.022058,,0.247202,,-0.018573,,-12524.015089,
2024-01-02 04:04:00,0.0,0.0,0.0,0.0,0.0,SPY,,-0.004815,-0.01,-0.008889,...,0.066898,,-0.020424,,0.22889,,-0.017198,,-11596.310268,


In [11]:
spy_1m_df.shape, one_min_base_df.shape

((214794, 87), (214794, 6))

In [12]:
spy_1m_df = pd.concat([one_min_base_df, spy_1m_df], axis=1)

In [13]:
spy_1m_df.shape

(214794, 93)

In [14]:
# Check if the Datetime index is sorted
if spy_1m_df.index.is_monotonic_increasing:
    print("The index is sorted.")
else:
    print("The index is not sorted. Sorting now.")
    spy_1m_df.sort_index(inplace=True)

The index is sorted.


#### lags

In [15]:
for lags in range(1, 16):
    for price in ['open_diff', 'high_diff', 'low_diff', 'close_diff']:
        spy_1m_df[f'{price}_lag_{lags}'] = spy_1m_df[f'{price}'].shift(lags)

In [16]:
spy_1m_df.shape

(214794, 153)

#### direction changes

In [17]:
def calculate_direction_changes(df, price_column='close'):
    df['price_change'] = df[price_column].diff()
    df['direction'] = df['price_change'].apply(lambda x: 'up' if x > 0 else 'down' if x < 0 else 'no change')
    df['direction_change'] = df['direction'].diff().ne(0) & df['direction'].ne('no change')
    direction_changes = df.groupby(['symbol', pd.Grouper(freq='D')])['direction_change'].sum().reset_index()
    direction_changes.rename(columns={'direction_change': 'daily_direction_changes'}, inplace=True)
    return direction_changes

In [18]:
spy_1m_df_backup = spy_1m_df.copy()
# spy_1m_df = spy_1m_df_backup.copy()

In [19]:
# calculate_direction_changes(spy_1m_df[['close']])
spy_1m_df['us_eastern_timestamp'] = spy_1m_df.index
spy_1m_df['market_open'] = (spy_1m_df.us_eastern_timestamp.dt.time>=pd.to_datetime('09:30:00').time()) & (spy_1m_df.us_eastern_timestamp.dt.time < pd.to_datetime('16:00:00').time())

In [20]:
spy_1m_df['date'] = spy_1m_df.us_eastern_timestamp.dt.date
spy_1m_df['price_change'] = spy_1m_df['close'].diff()
spy_1m_df['direction'] = spy_1m_df['price_change'].apply(lambda x: 'up' if x > 0 else 'down' if x < 0 else 'no change')
spy_1m_df['direction_prev'] = spy_1m_df['direction'].shift()
spy_1m_df['cumulative_ups'] = spy_1m_df['direction']=='up'
spy_1m_df['cumulative_ups'] = spy_1m_df.groupby('date').cumulative_ups.cumsum()
spy_1m_df['cumulative_downs'] = spy_1m_df['direction']=='down'
spy_1m_df['cumulative_downs'] = spy_1m_df.groupby('date').cumulative_downs.cumsum()
spy_1m_df['direction_change_up_to_down'] = (spy_1m_df.direction=='down')&(spy_1m_df.direction_prev=='up')
spy_1m_df['direction_change_down_to_up'] = (spy_1m_df.direction=='up')&(spy_1m_df.direction_prev=='down')
spy_1m_df['prev_date'] = spy_1m_df['date'].shift()
spy_1m_df['prev_cumulative_ups'] = spy_1m_df['cumulative_ups'].shift()
spy_1m_df['prev_cumulative_downs'] = spy_1m_df['cumulative_downs'].shift()
spy_1m_df['prev_cumulative_ups'] = np.where(spy_1m_df.prev_date==spy_1m_df.date, np.nan, spy_1m_df.prev_cumulative_ups)
spy_1m_df['prev_cumulative_downs'] = np.where(spy_1m_df.prev_date==spy_1m_df.date, np.nan, spy_1m_df.prev_cumulative_downs)
spy_1m_df['prev_cumulative_ups'] = spy_1m_df['prev_cumulative_ups'].ffill()
spy_1m_df['prev_cumulative_downs'] = spy_1m_df['prev_cumulative_downs'].ffill()
# spy_1m_df[['date', 'prev_date', 'cumulative_ups', 'prev_cumulative_ups']]
# spy_1m_df[['date', 'prev_date', 'cumulative_downs', 'prev_cumulative_downs']]
spy_1m_df.drop(columns=['direction_prev', 'date', 'prev_date', 'symbol', 'time_delta'], inplace=True)
spy_1m_df.tail(13)

Unnamed: 0_level_0,open,high,low,close,volume,open_diff,high_diff,low_diff,close_diff,volume_diff,...,us_eastern_timestamp,market_open,price_change,direction,cumulative_ups,cumulative_downs,direction_change_up_to_down,direction_change_down_to_up,prev_cumulative_ups,prev_cumulative_downs
us_eastern_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-11-19 19:47:00,591.63,591.63,591.63,591.63,250.0,-0.03,-0.03,-0.02,-0.02,-451.0,...,2024-11-19 19:47:00,False,-0.02,down,382,359,True,False,391.0,363.0
2024-11-19 19:48:00,591.67,591.67,591.67,591.67,159.0,0.04,0.04,0.04,0.04,-91.0,...,2024-11-19 19:48:00,False,0.04,up,383,359,False,True,391.0,363.0
2024-11-19 19:49:00,591.67,591.67,591.67,591.67,159.0,0.0,0.0,0.0,0.0,0.0,...,2024-11-19 19:49:00,False,0.0,no change,383,359,False,False,391.0,363.0
2024-11-19 19:50:00,591.68,591.68,591.63,591.63,1473.0,0.01,0.01,-0.04,-0.04,1314.0,...,2024-11-19 19:50:00,False,-0.04,down,383,360,False,False,391.0,363.0
2024-11-19 19:51:00,591.6,591.6099,591.6,591.6099,963.0,-0.08,-0.0701,-0.03,-0.0201,-510.0,...,2024-11-19 19:51:00,False,-0.0201,down,383,361,False,False,391.0,363.0
2024-11-19 19:52:00,591.58,591.58,591.58,591.58,597.0,-0.02,-0.0299,-0.02,-0.0299,-366.0,...,2024-11-19 19:52:00,False,-0.0299,down,383,362,False,False,391.0,363.0
2024-11-19 19:53:00,591.66,591.66,591.55,591.55,1559.0,0.08,0.08,-0.03,-0.03,962.0,...,2024-11-19 19:53:00,False,-0.03,down,383,363,False,False,391.0,363.0
2024-11-19 19:54:00,591.53,591.53,591.5,591.52,1052.0,-0.13,-0.13,-0.05,-0.03,-507.0,...,2024-11-19 19:54:00,False,-0.03,down,383,364,False,False,391.0,363.0
2024-11-19 19:55:00,591.53,591.53,591.5,591.52,1052.0,0.0,0.0,0.0,0.0,0.0,...,2024-11-19 19:55:00,False,0.0,no change,383,364,False,False,391.0,363.0
2024-11-19 19:56:00,591.52,591.54,591.48,591.48,3823.0,-0.01,0.01,-0.02,-0.04,2771.0,...,2024-11-19 19:56:00,False,-0.04,down,383,365,False,False,391.0,363.0


In [21]:
spy_1m_df['hour'] = spy_1m_df.us_eastern_timestamp.dt.hour
spy_1m_df['minute'] = spy_1m_df.us_eastern_timestamp.dt.minute
spy_1m_df['day_of_year'] = spy_1m_df.us_eastern_timestamp.dt.day_of_year

In [22]:
fields = []
for window_size in [5, 10, 15, 30]:
    spy_1m_df[f'local_max_{window_size}'] = (spy_1m_df['close'] >= spy_1m_df['close'].rolling(window=window_size, closed='left').max())
    spy_1m_df[f'local_min_{window_size}'] = (spy_1m_df['close'] <= spy_1m_df['close'].rolling(window=window_size, closed='left').min())

    # Cumulative count of rolling local maxima and minima
    spy_1m_df[f'cumulative_local_max_{window_size}'] = spy_1m_df[f'local_max_{window_size}'].cumsum()
    spy_1m_df[f'cumulative_local_min_{window_size}'] = spy_1m_df[f'local_min_{window_size}'].cumsum()

    distance_to_last_one = (spy_1m_df[f'local_max_{window_size}'].groupby((spy_1m_df[f'local_max_{window_size}'] == 1).cumsum()).cumcount()) * (spy_1m_df[f'local_max_{window_size}'] == 0) 
    spy_1m_df[f'time_since_prev_max_{window_size}'] = distance_to_last_one

    distance_to_last_one = (spy_1m_df[f'local_min_{window_size}'].groupby((spy_1m_df[f'local_min_{window_size}'] == 1).cumsum()).cumcount()) * (spy_1m_df[f'local_min_{window_size}'] == 0) 
    spy_1m_df[f'time_since_prev_min_{window_size}'] = distance_to_last_one
    fields += [f'local_max_{window_size}', f'local_min_{window_size}', f'cumulative_local_max_{window_size}', f'cumulative_local_min_{window_size}', f'time_since_prev_max_{window_size}', f'time_since_prev_min_{window_size}']

In [23]:
spy_1m_df[['close'] + fields].tail()

Unnamed: 0_level_0,close,local_max_5,local_min_5,cumulative_local_max_5,cumulative_local_min_5,time_since_prev_max_5,time_since_prev_min_5,local_max_10,local_min_10,cumulative_local_max_10,...,cumulative_local_max_15,cumulative_local_min_15,time_since_prev_max_15,time_since_prev_min_15,local_max_30,local_min_30,cumulative_local_max_30,cumulative_local_min_30,time_since_prev_max_30,time_since_prev_min_30
us_eastern_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-11-19 19:55:00,591.52,False,True,68985,64722,6,0,False,True,48468,...,39306,34643,6,0,False,False,27193,23140,6,16
2024-11-19 19:56:00,591.48,False,True,68985,64723,7,0,False,True,48468,...,39306,34644,7,0,False,False,27193,23140,7,17
2024-11-19 19:57:00,591.42,False,True,68985,64724,8,0,False,True,48468,...,39306,34645,8,0,False,False,27193,23140,8,18
2024-11-19 19:58:00,591.39,False,True,68985,64725,9,0,False,True,48468,...,39306,34646,9,0,False,False,27193,23140,9,19
2024-11-19 19:59:00,591.39,False,True,68985,64726,10,0,False,True,48468,...,39306,34647,10,0,False,False,27193,23140,10,20


In [24]:
spy_1m_df = spy_1m_df.drop(columns='us_eastern_timestamp')

In [25]:
# today's high till now
# today's low till now

In [26]:
spy_1m_df['max_today'] = spy_1m_df.groupby('day_of_year').high.cummax()
spy_1m_df['min_today'] = spy_1m_df.groupby('day_of_year').low.cummin()

spy_1m_df['max_today_session'] = spy_1m_df.groupby(['day_of_year', 'market_open']).high.cummax()
spy_1m_df['min_today_session'] = spy_1m_df.groupby(['day_of_year', 'market_open']).low.cummin()

In [27]:
spy_1m_df_loc = 's3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_1min_diff_other_features.parquet'
spy_1m_df.to_parquet(spy_1m_df_loc)

### updating config

In [43]:
def load_yaml(loc):
    with open(loc) as stream:
        try:
            print(f'reading from {loc}')
            return yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)

def get_path_info(path):
    name = path.split('/')[-1].replace('.parquet', '')
    time_period = name.replace('stock_bars_', '').split('_')[0]
    df = ds.dataset(path).scanner().head(1).to_pandas()
    columns = list(df.columns)
    return [name, time_period, df, columns]

def get_paths(s3_prefix = 's3://', bucket = 'sisyphus-general-bucket', primary_folder = 'AthenaInsights', loc='latest_data/feature_prep'):
    client = boto3.client('s3')
    response = client.list_objects_v2(
        Bucket=bucket,
        Prefix=f'{primary_folder}/{loc}/')

    paths = []
    for content in response.get('Contents', []):
        paths.append(f"{s3_prefix}{bucket}/{content['Key']}")
    return paths

def update_config(path):
    if not os.path.exists('all_info_independent_vars.csv'):
        paths = get_paths()
        all_info_independent_vars = pd.DataFrame(columns=['name', 'time_period', 'path', 'cols'])

        for path in tqdm(paths):
            name, time_period, df, columns = get_path_info(path)
            all_info_independent_vars.loc[all_info_independent_vars.shape[0]] = [name, time_period, path, columns]
    else:
        print('readinfg from local all_info_independent_vars.csv')
        all_info_independent_vars = pd.read_csv('all_info_independent_vars.csv')
        name, time_period, df, columns = get_path_info(path)
        if all_info_independent_vars[all_info_independent_vars.path==path].empty:
            all_info_independent_vars.loc[all_info_independent_vars.shape[0]] = [name, time_period, path, columns]
        else:
            all_info_independent_vars.loc[all_info_independent_vars.path==path] = [name, time_period, path, columns]
    all_info_independent_vars.to_csv('all_info_independent_vars.csv', index=False)
    
    all_paths_config_loc = '../../config/spy_30min_v1/features.yaml'
    all_paths_config = load_yaml(all_paths_config_loc)
    all_paths_config['feature_store'][time_period][name] = {}
    all_paths_config['feature_store'][time_period][name]['path'] = path
    all_paths_config['feature_store'][time_period][name]['cols'] = columns
    with open('../../config/spy_30min_v1/features.yaml', 'w+') as ff:
        yaml.dump(all_paths_config, ff)

In [44]:
spy_1m_df_loc = 's3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_1min_diff_other_features.parquet'

In [45]:
update_config(spy_1m_df_loc)

readinfg from local all_info_independent_vars.csv
reading from ../../config/spy_30min_v1/features.yaml


In [34]:
# all_paths_config_loc = '../../config/spy_30min_v1/features.yaml'
# all_paths_config = load_yaml(all_paths_config_loc)
all_paths_config['feature_store']['1min']['stock_bars_1min_base']['path']

's3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_1min_base.parquet'

In [30]:
# spy_1m_df.to_parquet('s3://sisyphus-general-bucket/AthenaInsights/latest_data/feature_prep/stock_bars_1min_base_other_features.parquet')