In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

### data

In [2]:
df1 = pd.read_parquet('stock_bars.parquet')
spy_df = df1[df1.symbol=='SPY']
spy_df['timestamp'] = pd.to_datetime(spy_df['timestamp'])
spy_df['us_eastern_timestamp'] = spy_df['timestamp'].dt.tz_convert('US/Eastern')
spy_df['us_eastern_date'] = spy_df.us_eastern_timestamp.dt.date
spy_df['market_open'] = spy_df.us_eastern_timestamp.between('09:30:00', '16:00:00')
spy_df.set_index('us_eastern_timestamp', inplace=True)

In [3]:
spy_df.head()

Unnamed: 0_level_0,symbol,timestamp,open,high,low,close,volume,trade_count,vwap,us_eastern_date,market_open
us_eastern_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2019-12-31 19:01:00-05:00,SPY,2020-01-01 00:01:00+00:00,322.36,322.36,322.36,322.36,1073.0,23.0,322.36,2019-12-31,False
2019-12-31 19:11:00-05:00,SPY,2020-01-01 00:11:00+00:00,322.38,322.38,322.38,322.38,300.0,6.0,322.38,2019-12-31,False
2019-12-31 19:12:00-05:00,SPY,2020-01-01 00:12:00+00:00,322.38,322.38,322.38,322.38,2400.0,16.0,322.38,2019-12-31,False
2019-12-31 19:14:00-05:00,SPY,2020-01-01 00:14:00+00:00,322.35,322.35,322.35,322.35,200.0,1.0,322.35,2019-12-31,False
2019-12-31 19:18:00-05:00,SPY,2020-01-01 00:18:00+00:00,322.38,322.38,322.38,322.38,1910.0,28.0,322.38,2019-12-31,False


In [4]:
spy_df_daily = pd.concat([
    spy_df.groupby('us_eastern_date').open.first(),
    spy_df.groupby('us_eastern_date').close.last(),
    spy_df.groupby('us_eastern_date').high.max(),
    spy_df.groupby('us_eastern_date').low.min()], axis=1).reset_index()

In [5]:
spy_df_backup = spy_df.copy()

In [6]:
spy_df = spy_df.drop(columns=['timestamp'])
spy_df

Unnamed: 0_level_0,symbol,open,high,low,close,volume,trade_count,vwap,us_eastern_date,market_open
us_eastern_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019-12-31 19:01:00-05:00,SPY,322.3600,322.36,322.3600,322.3600,1073.0,23.0,322.360000,2019-12-31,False
2019-12-31 19:11:00-05:00,SPY,322.3800,322.38,322.3800,322.3800,300.0,6.0,322.380000,2019-12-31,False
2019-12-31 19:12:00-05:00,SPY,322.3800,322.38,322.3800,322.3800,2400.0,16.0,322.380000,2019-12-31,False
2019-12-31 19:14:00-05:00,SPY,322.3500,322.35,322.3500,322.3500,200.0,1.0,322.350000,2019-12-31,False
2019-12-31 19:18:00-05:00,SPY,322.3800,322.38,322.3800,322.3800,1910.0,28.0,322.380000,2019-12-31,False
...,...,...,...,...,...,...,...,...,...,...
2024-11-07 14:06:00-05:00,SPY,594.7100,594.97,594.5000,594.9300,163155.0,1225.0,594.718549,2024-11-07,False
2024-11-07 14:07:00-05:00,SPY,594.8800,594.94,594.7601,594.8959,103768.0,760.0,594.845416,2024-11-07,False
2024-11-07 14:08:00-05:00,SPY,594.9000,595.15,594.8500,595.0400,63505.0,783.0,594.996543,2024-11-07,False
2024-11-07 14:09:00-05:00,SPY,595.0101,595.04,594.9000,595.0100,100873.0,726.0,594.953314,2024-11-07,False


### features

#### ema, sma, rsi

In [7]:
# ema, sma
 
def calculate_ma(df, ema=True, sma=True, ):
    # minute_windows = [1, 2, 3, 5, 9, 10, 12, 10, 30]  # Minute-level windows
    minute_windows = [x for x in range(1, 30)] + [50, 100, 200, 500]
    hour_windows = [60, 120]  # Hour-level windows, converted to minutes
    day_windows = [1440, 2880, 4320, 7200, 14400, 20160]  # Day-level windows, converted to minutes

    # Combine all windows for processing
    all_windows = minute_windows + hour_windows + day_windows

    # Calculate EMAs and SMAs for each window
    for window in all_windows:
        for price in ['open', 'high', 'low', 'close', 'volume']:
            df[f'{price}_ema_{window}m'] = df[price].ewm(span=window, adjust=False).mean()
            df[f'{price}_sma_{window}m'] = df[price].rolling(window=window).mean()
    return df

In [8]:
# rsi

def calculate_rsi(data, window=14, ewm=True, rm=True, diff_type='close_prev'):
    if diff_type == 'close_prev':
        delta = data.diff()
    elif diff_type == 'close_open':
        delta = data['close'] - data['open']

    gain = delta.where(delta > 0, 0)
    loss = -delta.where(delta < 0, 0)

    if ewm:
        avg_gain = gain.ewm(span=window, adjust=False).mean()
        avg_loss = loss.ewm(span=window, adjust=False).mean()
    if rm:
        avg_gain = gain.rolling(window=window, min_periods=1).mean()
        avg_loss = loss.rolling(window=window, min_periods=1).mean()
    
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

def apply_rsi(df, timeframe=5, ewm=True, rm=True, window=14, diff_type='close_prev'):
    # Adjust how the close price is handled based on the timeframe
    timeframe_label = f'{timeframe}T'
    df[f'close_last_{timeframe_label}'] = df['close'].rolling(window=timeframe).apply(lambda x: x[-1])

    # Select the type of data to pass to the RSI calculation
    if diff_type == 'close_prev':
        data_for_rsi = df[f'close_last_{timeframe_label}']
    elif diff_type == 'close_open':
        data_for_rsi = df[['open', f'close_last_{timeframe_label}']].rename(columns={f'close_last_{timeframe_label}': 'close'})

    # Calculate RSI with the specified average type, window, and difference type
    df[f'{diff_type}_rsi_{window}_{timeframe_label}'] = calculate_rsi(data_for_rsi, window=window, ewm=ewm, rm=rm, diff_type=diff_type)

    return df

# Example of usage
# apply_rsi(spy_df, timeframe=5, ewm=True, rm=True, window=14, diff_type='close_prev')

def get_all_rsis(df):
    timeframes = [1, 2, 3, 5, 10, 30, 60, 120, 1440, 2880, 4320, 7200, 14400, 20160]  # time blocks
    windows = [7, 9, 12, 14, 20, 21, 26, 28, 30, 50] # lookback window
    for timrframe in timeframes:
        for window in windows:
            for diff_type in ['close_prev', 'close_open']:
                df = apply_rsi(df, timeframe=timrframe, ewm=True, rm=True, window=window, diff_type=diff_type)
    return df

In [9]:
def calculate_features(df):
    df = calculate_ma(df)
    df = get_all_rsis(df)
    return df

#### calculations

In [10]:
spy_df = calculate_features(spy_df)

In [11]:
spy_df.tail()

Unnamed: 0_level_0,symbol,open,high,low,close,volume,trade_count,vwap,us_eastern_date,market_open,...,close_prev_rsi_21_20160T,close_open_rsi_21_20160T,close_prev_rsi_26_20160T,close_open_rsi_26_20160T,close_prev_rsi_28_20160T,close_open_rsi_28_20160T,close_prev_rsi_30_20160T,close_open_rsi_30_20160T,close_prev_rsi_50_20160T,close_open_rsi_50_20160T
us_eastern_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-11-07 14:06:00-05:00,SPY,594.71,594.97,594.5,594.93,163155.0,1225.0,594.718549,2024-11-07,False,...,60.877286,62.595151,60.601859,62.690097,53.439287,55.694659,56.102109,58.009791,50.112098,50.40846
2024-11-07 14:07:00-05:00,SPY,594.88,594.94,594.7601,594.8959,103768.0,760.0,594.845416,2024-11-07,False,...,65.404245,68.545697,60.012114,63.444417,52.861462,55.708958,53.432145,56.392331,50.178471,51.165184
2024-11-07 14:08:00-05:00,SPY,594.9,595.15,594.85,595.04,63505.0,783.0,594.996543,2024-11-07,False,...,66.508127,69.631027,63.053063,66.318139,61.974454,64.91988,55.090731,58.041174,53.969514,54.932181
2024-11-07 14:09:00-05:00,SPY,595.0101,595.04,594.9,595.01,100873.0,726.0,594.953314,2024-11-07,False,...,65.204773,69.132224,61.96634,65.982521,61.506829,65.418936,54.605439,57.824291,53.182831,54.404133
2024-11-07 14:10:00-05:00,SPY,595.01,595.04,594.7901,594.7901,80904.0,631.0,594.97934,2024-11-07,False,...,57.047216,60.68525,57.94261,60.92529,57.496032,61.057831,56.597813,59.870607,49.891173,51.201745


In [None]:
[x for x in spy_df.columns if 'close_ema' in x]

In [None]:
spy_df['close_ema_12m'] - spy_df['close_ema_26m']

### Output files

In [13]:
spy_df.to_parquet('spy_df_with_rsi_11_9_2024_19_10.parquet', index=False, partition_cols=['symbol', 'us_eastern_date'])
spy_df.to_parquet('spy_df_with_rsi_11_9_2024_19_10_copy.parquet', index=False, partition_cols=['symbol', 'us_eastern_date'])
spy_df.to_parquet('s3://sisyphus-general-bucket/AthenaInsights/temp_data/spy_df_with_rsi_11_9_2024_19_10.parquet', index=False, partition_cols=['symbol', 'us_eastern_date'])


: 

#### lags

In [None]:
# spy_df[spy_df['open']!=spy_df['open_ema_1m']]
# print('\n'.join([x for x in spy_df.columns if 'open_ema' in x]))
# print('\n'.join([x for x in spy_df.columns if 'open_sma' in x]))
# print('\n'.join([x for x in spy_df.columns if 'rsi' in x]))

In [None]:
# timeframes = [1, 2, 3, 5, 10, 30, 60, 120, 1440, 2880, 4320, 7200, 14400, 20160]  # time blocks
# for timeframe in timeframes:
#     for price in ['open', 'high', 'low', 'close', 'volume']:
#         spy_df[f'{price}_last_{timeframe_label}'] = spy_df[f'{price}'].rolling(window=timeframe).apply(lambda x: x[-1])
#         spy_df[f'{price}_ema_last_{timeframe_label}m'] = spy_df[f'{price}_ema_{window}m'].rolling(window=timeframe).apply(lambda x: x[-1])
#         spy_df[f'{price}_sma_last_{timeframe_label}m'] = spy_df[f'{price}_sma_{window}m'].rolling(window=timeframe).apply(lambda x: x[-1])