In [1]:
import boto3
import pandas as pd
import numpy as np
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore")

### Load data

In [2]:
# Create an S3 client handle.
# NOTE(review): no cell visible here references `client`; the parquet read/write cells pass
# s3:// URLs straight to pandas. Confirm whether this handle is still needed.
client = boto3.client('s3')

In [3]:
# Components of the S3 location used by this notebook.
s3_prefix = 's3://'  # URL scheme prefix
bucket = 'sisyphus-general-bucket'  # bucket name
primary_folder = 'AthenaInsights'  # top-level folder inside the bucket

In [4]:
# Load the 1-minute bars from the feature_prep location and keep only the SPY rows.
# The URL is assembled from the bucket/folder settings defined in the config cell so the
# location is maintained in one place (it resolves to the same string as before).
spy_1m_df_loc = f'{s3_prefix}{bucket}/{primary_folder}/data/feature_prep/stock_bars_1min.parquet'
spy_1m_df = pd.read_parquet(spy_1m_df_loc)
# .copy(): later cells sort this frame and add columns to it; taking an explicit copy means
# those writes target an independent frame rather than a filtered selection of the full one.
spy_1m_df = spy_1m_df[spy_1m_df.symbol == 'SPY'].copy()

In [5]:
# The labelling code slices rows by position, so the index must be in ascending order.
index_is_sorted = spy_1m_df.index.is_monotonic_increasing
if not index_is_sorted:
    print("The index is not sorted. Sorting now.")
    spy_1m_df = spy_1m_df.sort_index()
else:
    print("The index is sorted.")


The index is sorted.


### Dependent variable (label) creation

In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

def calculate_trend_slope(df, window=20, field='close'):
    """Return the least-squares slope of ``field`` over the last ``window`` rows of ``df``.

    The values are regressed against their position (0, 1, ..., window - 1), so the
    slope is expressed in units of ``field`` per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column ``field``. Only its trailing ``window`` rows are used.
    window : int
        Number of trailing rows to fit. Must be at least 1.
    field : str
        Name of the column to fit.

    Returns
    -------
    float
        Slope coefficient of the fitted line.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1 or ``df`` has fewer than ``window`` rows.
    """
    if window < 1:
        raise ValueError(f"window must be at least 1, got {window}")

    values = df[field].values
    if len(values) < window:
        raise ValueError(f"need at least {window} rows to fit the trend, got {len(values)}")

    # Use only the trailing `window` observations so x and y always have the same length;
    # previously a frame longer than `window` failed inside fit() with a shape mismatch.
    y = values[len(values) - window:].reshape(-1, 1)
    # Row positions serve as the regressor.
    x = np.arange(window).reshape(-1, 1)

    reg = LinearRegression()
    reg.fit(x, y)
    # coef_ has shape (n_targets, n_features) == (1, 1) for this fit.
    return reg.coef_[0][0]

def categorize_points(df, field='close', prev_data_points=20, positive_slope_threshold=0.0, negative_slope_threshold=0.0, positive_rise_threshold=0.0003, negative_drop_threshold=0.0003, positive_future_window=30, negative_future_window=30):
    """Label every row of ``df`` as 'A', 'B' or 'C' from its past trend and future range.

    For a row ``i`` that has ``prev_data_points`` rows before it and a complete
    future window after it:

    * 'A' - the slope over the previous ``prev_data_points`` rows is below
      ``negative_slope_threshold`` and the maximum of the next
      ``positive_future_window`` rows exceeds ``current * (1 + positive_rise_threshold)``.
    * 'B' - the slope is above ``positive_slope_threshold`` and the minimum of the
      next ``negative_future_window`` rows is below
      ``current * (1 - negative_drop_threshold)``.
    * 'C' - anything else.

    Rows without enough history or without a complete future window are labelled 'C'
    and get NaN for the future high/low.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows in chronological order, containing the column ``field``.
    field : str
        Column used for the trend, the current value and the future high/low.
    prev_data_points : int
        Number of rows before the current one used for the trend slope.
    positive_slope_threshold, negative_slope_threshold : float
        Slope limits for the 'B' and 'A' conditions respectively.
    positive_rise_threshold, negative_drop_threshold : float
        Fractional move relative to the current value required for 'A' / 'B'.
    positive_future_window, negative_future_window : int
        Number of rows after the current one scanned for the future high / low.

    Returns
    -------
    tuple[list, list, list]
        ``(categories, future_highs, future_lows)``, each of length ``len(df)``.
    """
    categories = []
    future_highs = []
    future_lows = []

    series = df[field]
    n_rows = len(df)
    longest_window = max(positive_future_window, negative_future_window)
    # Row i looks at rows i+1 .. i+window, so the last row with a complete future window
    # is n_rows - window - 1. (The previous bound let one extra row through whose window
    # was one bar short.)
    last_labelable = n_rows - longest_window - 1

    for i in range(n_rows):
        if i < prev_data_points or i > last_labelable:  # Not enough data to categorize
            categories.append('C')  # Consider as undecided for now
            future_highs.append(np.nan)
            future_lows.append(np.nan)
            continue

        # Trend over the previous `prev_data_points` rows (current row excluded)
        past_trend_slope = calculate_trend_slope(df.iloc[i - prev_data_points:i], window=prev_data_points, field=field)

        # Current value and the extremes of the rows that follow it
        current_price = series.iloc[i]
        future_high = series.iloc[i + 1:i + 1 + positive_future_window].max()
        future_low = series.iloc[i + 1:i + 1 + negative_future_window].min()
        future_highs.append(future_high)
        future_lows.append(future_low)

        # Thresholds are expressed relative to the current value
        high_threshold = current_price * (1 + positive_rise_threshold)
        low_threshold = current_price * (1 - negative_drop_threshold)

        # 'A' is checked first, so a row that satisfies both conditions is labelled 'A'
        if past_trend_slope < negative_slope_threshold and future_high > high_threshold:
            categories.append('A')
        elif past_trend_slope > positive_slope_threshold and future_low < low_threshold:
            categories.append('B')
        else:
            categories.append('C')

    return categories, future_highs, future_lows

def plot_categorization(df, date, field='close'):
    """Draw one day's ``field`` series with the category A/B markers overlaid.

    ``df`` must already contain a 'category' column; rows are selected with
    ``df.loc[date]`` (presumably a date label resolved against a datetime index - verify
    against the caller). Category 'C' points are drawn with marker size 0.

    NOTE(review): a later cell of this notebook defines ``plot_categorization`` again;
    that later definition is the one in effect after a full run.
    """
    day_bars = df.loc[date]
    price_label = f'{field.capitalize()} Price'

    plt.figure(figsize=(14, 7))
    plt.plot(day_bars.index, day_bars[field], label=price_label, color='gray', linewidth=2)

    marker_colors = {'A': 'green', 'B': 'red', 'C': 'gray'}
    for label, color in marker_colors.items():
        members = day_bars[day_bars['category'] == label]
        # 'C' is the undecided bucket, so its markers are given zero size
        marker_size = 0 if label == 'C' else 20
        plt.scatter(members.index, members[field], color=color, label=f'Category {label}', s=marker_size)

    plt.legend()
    plt.title(f'Price Categorization on {date}')
    plt.xlabel('Timestamp')
    plt.ylabel(price_label)
    plt.show()

In [7]:
# Distinct us_eastern_date values; the labelling loop processes one of these at a time.
dates = spy_1m_df['us_eastern_date'].unique()

In [8]:
# Parameters for the labelling run, collected in one place so they are easy to find and tune.
labeling_params = dict(
    field='close_sma_5m',
    prev_data_points=11,
    positive_slope_threshold=0.013,
    negative_slope_threshold=-0.026,
    positive_rise_threshold=0.0009,
    negative_drop_threshold=0.0018,
    positive_future_window=30,
    negative_future_window=30,
)

# Start from neutral values; every row belonging to one of `dates` is overwritten below.
spy_1m_df['category'] = ''
spy_1m_df['future_highs'] = np.nan
spy_1m_df['future_lows'] = np.nan

# Label one us_eastern_date at a time so the look-back and look-ahead windows never
# span two different dates.
for date in dates:
    # Build the row mask once and reuse it for the slice and for the three write-backs
    # (it was previously recomputed four times per date).
    day_mask = spy_1m_df.us_eastern_date == date
    categories, future_highs, future_lows = categorize_points(spy_1m_df[day_mask], **labeling_params)

    spy_1m_df.loc[day_mask, 'category'] = categories
    spy_1m_df.loc[day_mask, 'future_highs'] = future_highs
    spy_1m_df.loc[day_mask, 'future_lows'] = future_lows


In [9]:
# Persist the price columns together with the generated labels.
output_columns = [
    'open', 'high', 'low', 'close',
    'us_eastern_date', 'close_sma_5m',
    'category', 'future_highs', 'future_lows',
]
dependent_variable_loc = 's3://sisyphus-general-bucket/AthenaInsights/data/dependent_variable/stock_bars_1min.parquet'
spy_1m_df[output_columns].to_parquet(dependent_variable_loc)

In [10]:


def plot_categorization(df, date, field='close'):
    """Draw one day's ``field`` series with the category A/B markers overlaid.

    ``df`` must already contain a 'category' column; rows are selected with
    ``df.loc[date]`` (presumably a date label resolved against a datetime index - verify
    against the caller). Category 'C' points are drawn with marker size 0.

    NOTE(review): this repeats the ``plot_categorization`` definition from the earlier
    "Variable creation" cell and shadows it; one of the two definitions can be dropped.
    """
    day_bars = df.loc[date]
    price_label = f'{field.capitalize()} Price'

    plt.figure(figsize=(14, 7))
    plt.plot(day_bars.index, day_bars[field], label=price_label, color='gray', linewidth=2)

    marker_colors = {'A': 'green', 'B': 'red', 'C': 'gray'}
    for label, color in marker_colors.items():
        members = day_bars[day_bars['category'] == label]
        # 'C' is the undecided bucket, so its markers are given zero size
        marker_size = 0 if label == 'C' else 20
        plt.scatter(members.index, members[field], color=color, label=f'Category {label}', s=marker_size)

    plt.legend()
    plt.title(f'Price Categorization on {date}')
    plt.xlabel('Timestamp')
    plt.ylabel(price_label)
    plt.show()