In [0]:
%run /Workspace/Users/jorgegarciaotero@gmail.com/config/database_connector

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, lag, avg, stddev, when, log

In [0]:
def add_basic_stock_metrics(df):
    """
    Adds daily financial and price-based indicators to stock data.

    Previous-day values are looked up per symbol in date order, so the first
    row of each symbol has no predecessor: prev_close / prev_volume are null
    there, and so is every metric derived from them.

    The function only builds the transformation; it does not trigger a Spark
    action. Call ``.show()`` / ``display()`` on the returned DataFrame if a
    preview is wanted.

    New columns added:
    - prev_close: Closing price from the previous trading day (per symbol).
    - prev_volume: Volume from the previous trading day.
    - daily_return: Daily return, calculated as (close - open) / open.
    - close_change_pct: Relative change of close price vs previous day's close.
    - intraday_volatility: Daily volatility, calculated as (high - low) / open.
    - price_range: Absolute daily price range (high - low).
    - gap_open: Difference between today's open and previous close, relative to previous close.
    - log_return: Natural logarithm of close / previous close.
    - volume_change_pct: Volume change vs previous day, relative to previous volume.
    - is_dividend_day: Binary indicator (1 if dividends > 0, else 0).
    - is_stock_split: Binary indicator (1 if stock_splits > 0, else 0).

    Args:
        df (DataFrame): Spark DataFrame with historical stock prices and structure similar to:
            ['date', 'symbol', 'open_v', 'high', 'low', 'close_v', 'volume', 'dividends', 'stock_splits']

    Returns:
        DataFrame: Enriched Spark DataFrame with additional technical and financial columns.
    """
    # One ordered window per symbol; lag() reads the previous row inside it.
    window_spec = Window.partitionBy("symbol").orderBy("date")

    return (
        df
        # Previous-day references used by the relative metrics below.
        .withColumn("prev_close", lag("close_v", 1).over(window_spec))
        .withColumn("prev_volume", lag("volume", 1).over(window_spec))
        # Intraday metrics (only need the current row).
        .withColumn("daily_return", (col("close_v") - col("open_v")) / col("open_v"))
        .withColumn("close_change_pct", (col("close_v") - col("prev_close")) / col("prev_close"))
        .withColumn("intraday_volatility", (col("high") - col("low")) / col("open_v"))
        .withColumn("price_range", col("high") - col("low"))
        # Day-over-day metrics (need the previous row).
        .withColumn("gap_open", (col("open_v") - col("prev_close")) / col("prev_close"))
        .withColumn("log_return", log(col("close_v") / col("prev_close")))
        .withColumn("volume_change_pct", (col("volume") - col("prev_volume")) / col("prev_volume"))
        # Corporate-action flags.
        .withColumn("is_dividend_day", when(col("dividends") > 0, 1).otherwise(0))
        .withColumn("is_stock_split", when(col("stock_splits") > 0, 1).otherwise(0))
    )


In [0]:
def add_technical_stock_metrics(df):
    """
    Adds technical indicators to stock price data.

    All indicators are computed per symbol over trailing row windows ordered by
    date (current row included). Early rows of a symbol, which have fewer
    preceding rows than the window size, are computed over the rows available.

    Columns added:
    - sma_5, sma_20: simple moving averages of close_v over 5 and 20 rows.
    - delta, gain, loss: helper columns for the RSI (close-to-close change and
      its positive / negative parts); they are left in the output.
    - rsi_14: Relative Strength Index over 14 rows. It is 100 when the window
      contains gains but no losses, and null when it contains neither.
    - bollinger_upper, bollinger_lower: 20-row SMA +/- 2 standard deviations
      (Spark ``stddev``) of close_v.
    - rel_volume: volume divided by its 5-row average.

    Args:
        df (DataFrame): Spark DataFrame with stock price data, including at least:
            ['date', 'symbol', 'open_v', 'close_v', 'high', 'low', 'volume']

    Returns:
        DataFrame: Spark DataFrame with new technical indicator columns.
    """
    window_spec = Window.partitionBy("symbol").orderBy("date")
    window_5 = window_spec.rowsBetween(-4, 0)
    window_14 = window_spec.rowsBetween(-13, 0)
    window_20 = window_spec.rowsBetween(-19, 0)

    # SMA 5 and 20
    df = df.withColumn("sma_5", avg("close_v").over(window_5))
    df = df.withColumn("sma_20", avg("close_v").over(window_20))

    # RSI 14
    delta = col("close_v") - lag("close_v", 1).over(window_spec)
    # A null delta (first row of a symbol) falls through to otherwise(0).
    gain = when(delta > 0, delta).otherwise(0)
    loss = when(delta < 0, -delta).otherwise(0)

    df = df.withColumn("delta", delta)
    df = df.withColumn("gain", gain)
    df = df.withColumn("loss", loss)

    avg_gain = avg("gain").over(window_14)
    avg_loss = avg("loss").over(window_14)

    # A zero average loss makes RS = avg_gain / avg_loss undefined. Turn the
    # zero denominator into null so the division yields null instead of a
    # division by zero (which raises when ANSI SQL mode is enabled).
    safe_avg_loss = when(avg_loss != 0, avg_loss)
    rs = avg_gain / safe_avg_loss

    # With gains and no losses RS tends to infinity, i.e. RSI saturates at 100.
    # If there are neither gains nor losses, RSI stays null (undefined).
    rsi = when((avg_loss == 0) & (avg_gain > 0), 100.0).otherwise(100 - (100 / (1 + rs)))

    df = df.withColumn("rsi_14", rsi)

    # Bollinger Bands
    sma_20 = avg("close_v").over(window_20)
    std_20 = stddev("close_v").over(window_20)

    df = df.withColumn("bollinger_upper", sma_20 + 2 * std_20)
    df = df.withColumn("bollinger_lower", sma_20 - 2 * std_20)

    # Relative volume
    vol_avg_5 = avg("volume").over(window_5)
    df = df.withColumn("rel_volume", col("volume") / vol_avg_5)

    return df

In [0]:
if __name__ == "__main__":
    # Drop any existing widgets before re-creating them below.
    dbutils.widgets.removeAll()

    # Declare the notebook parameters: (name, default value, label).
    dbutils.widgets.text("storage_account", "smartwalletjorge", "Storage Account")
    dbutils.widgets.text("container", "smart-wallet-dl", "Container")
    dbutils.widgets.text("database", "smart_wallet", "Database")
    dbutils.widgets.text("date", "2024-04-10", "Date")

    # Read the current widget values.
    storage_account = dbutils.widgets.get("storage_account")
    container = dbutils.widgets.get("container")
    database_name = dbutils.widgets.get("database")
    # An empty (or missing) date is passed on as None.
    date_value = dbutils.widgets.get("date") or None

    # DatabaseConnector is presumably provided by the %run'd
    # database_connector notebook -- it is not defined in this file.
    db_connector = DatabaseConnector()

    # Load the raw stock data, then enrich it in two passes.
    df_stock_data = db_connector.read_table_from_sql("stock_data", date_value)
    df_stock_data = add_basic_stock_metrics(df_stock_data)
    df_stock_data = add_technical_stock_metrics(df_stock_data)

    

In [0]:
# Disabled cell: the calls below sit inside a string literal, so none of them run.
# NOTE(review): `df` and `table_name` are not defined in the visible cells --
# confirm what they should refer to before re-enabling this code.
'''db_connector.save_table( df,container,database_name, storage_account,table_name,date_value)
df2=db_connector.read_table_from_path(container, database_name, "stock_data", date_value)
display(df2)'''