# Databricks — Alert → Outcome Analysis (Sydney time)

Goal: quantify which alerts (by `signal`, `source_tf`) are predictive for forward returns / momentum continuation.

This notebook:
- maps each alert to the **next OHLC bar**, i.e. the first bar whose `open_time` is at or after the alert timestamp (to avoid lookahead)
- computes forward returns at multiple horizons
- computes MAE/MFE over a window
- computes ATR-normalized outcomes (move in ATRs)
- aggregates results by signal, timeframe, and Sydney time buckets (hour/dow)

Assumptions:
- Tables exist in `workspace.squeeze`: `alerts`, `ohlc`, and `clean_universe`
- OHLC timestamps are epoch ms in `open_time`; alert timestamps (`ts`) are assumed to be epoch ms as well, because the two are compared directly


In [None]:
%sql
-- Select the default catalog and schema for the session; the Python cells below
-- refer to the tables by unqualified name (`alerts`, `ohlc`, `clean_universe`).
USE CATALOG `workspace`;
USE SCHEMA `squeeze`;


In [None]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Time zone name used when converting UTC timestamps to local (Sydney) time.
TZ = 'Australia/Sydney'

# Forward-return horizons, counted in bars after the entry bar.
# On 1h bars these are 1h, 4h, 12h, 1d and 2d; retune when analysing another interval.
HORIZONS = [
    1,
    4,
    12,
    24,
    48,
]

# Number of bars after the entry bar scanned for MAE/MFE.
MAE_MFE_WINDOW = 48

# Width of the alert de-duplication bucket in milliseconds (one hour).
# Alert systems often re-fire the same alert in quick succession.
DEDUP_WINDOW_MS = 1000 * 60 * 60


## Build an alert dataset aligned to OHLC bars
We restrict OHLC to the chosen `INTERVAL` and to the clean universe, join alerts to it on `(exchange, symbol)`, and map each alert to the first bar whose `open_time` is at or after the alert timestamp.

In [None]:
# Bar interval analysed in this run; 1h is the recommended starting point.
INTERVAL = '1h'

# Clean-universe rows for the selected interval, reduced to the join keys.
univ = (
    spark.table('clean_universe')
    .filter(F.col('interval') == INTERVAL)
    .select('exchange', 'symbol', 'interval')
)

# Raw alerts plus derived time columns: UTC timestamp, Sydney-local timestamp,
# and the Sydney hour / day-of-week used later as session buckets.
# NOTE(review): `ts` is divided by 1000 before conversion, so it is presumably epoch
# milliseconds, and to_timestamp is relied on to read the numeric result as epoch
# seconds - confirm on the target runtime.
alerts0 = spark.table('alerts').select(
    'exchange', 'symbol', 'signal', 'source_tf', 'ts', 'created_ts',
    F.to_timestamp(F.col('ts') / 1000).alias('ts_utc'),
)
alerts0 = alerts0.withColumn('ts_syd', F.from_utc_timestamp(F.col('ts_utc'), TZ))
alerts0 = (
    alerts0
    .withColumn('hour_syd', F.hour('ts_syd'))
    .withColumn('dow_syd', F.date_format('ts_syd', 'E'))
)

# Collapse repeated alerts: keep only the earliest alert (by `ts`) in each fixed
# DEDUP_WINDOW_MS-wide bucket per (exchange, symbol, signal, source_tf).
# Buckets are aligned to multiples of the window, so two alerts that straddle a bucket
# boundary are both kept even when they are closer together than the window.
_dedup_keys = ['exchange', 'symbol', 'signal', 'source_tf', 'dedup_bucket']
alerts1 = alerts0.withColumn(
    'dedup_bucket',
    (F.col('ts') / F.lit(DEDUP_WINDOW_MS)).cast('bigint'),
)
alerts1 = alerts1.withColumn(
    'rn',
    F.row_number().over(Window.partitionBy(*_dedup_keys).orderBy('ts')),
)
alerts1 = alerts1.filter(F.col('rn') == 1).drop('rn')

# OHLC bars for the selected interval, with `open_time` also exposed as a timestamp.
ohlc0 = (
    spark.table('ohlc')
    .filter(F.col('interval') == INTERVAL)
    .select(
        'exchange', 'symbol', 'interval', 'open_time',
        'open', 'high', 'low', 'close', 'volume',
        F.to_timestamp(F.col('open_time') / 1000).alias('open_dt_utc'),
    )
)

# Restrict the bars to symbols listed in the clean universe.
ohlc1 = ohlc0.join(univ, on=['exchange', 'symbol', 'interval'], how='inner')

# Anti-lookahead alignment: for every alert, find the earliest bar of the same
# (exchange, symbol) whose open_time is at or after the alert timestamp.
# Implemented as a join on the symbol keys, a range filter, then a min() per alert.
# Every alert is paired with all of its later bars before aggregation, so on very
# large data a more optimized approach may be needed.
_alert_cols = [
    'exchange', 'symbol', 'signal', 'source_tf', 'ts', 'created_ts',
    'ts_utc', 'ts_syd', 'hour_syd', 'dow_syd',
]
alert_to_bar = (
    alerts1
    .join(ohlc1, on=['exchange', 'symbol'], how='inner')
    .filter(F.col('ts') <= F.col('open_time'))
    .groupBy(*_alert_cols)
    .agg(F.min('open_time').alias('entry_open_time'))
)

# Entry-bar view of the OHLC data: same rows as ohlc1, with the bar fields renamed
# to `entry_*` so they can be attached to alerts without clashing with other columns.
ohlc_entry = ohlc1.select(
    'exchange',
    'symbol',
    F.col('open_time').alias('entry_open_time'),
    F.col('open').alias('entry_open'),
    F.col('high').alias('entry_high'),
    F.col('low').alias('entry_low'),
    F.col('close').alias('entry_close'),
    F.col('volume').alias('entry_volume'),
    F.col('open_dt_utc').alias('entry_dt_utc'),
)

# One row per aligned alert, carrying the prices of its entry bar and the entry
# bar's open time converted to Sydney local time.
entry = alert_to_bar.join(
    ohlc_entry,
    on=['exchange', 'symbol', 'entry_open_time'],
    how='inner',
)
entry = entry.withColumn('entry_dt_syd', F.from_utc_timestamp(F.col('entry_dt_utc'), TZ))

# Preview a sample, then report how many alerts were aligned to a bar.
display(entry.limit(20))
print('Aligned alerts:', entry.count())


## Add forward return horizons + MAE/MFE
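We’ll compute forward returns relative to `entry_open` at various horizons (close of the bar `h` bars after the entry bar, divided by `entry_open`, minus 1), plus max favorable/adverse excursion within a window. MFE uses the window's highest high and MAE its lowest low, so both are measured from a long position's point of view.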

In [None]:
# Number the bars of each (exchange, symbol, interval) series from 0 in open_time
# order, so a horizon of h bars becomes a lookup of bar_idx + h.
w = Window.partitionBy('exchange', 'symbol', 'interval').orderBy('open_time')
ohlc_idx = ohlc1.select(
    'exchange', 'symbol', 'interval', 'open_time',
    (F.row_number().over(w) - 1).alias('bar_idx'),
    'open', 'high', 'low', 'close',
)

# Attach the entry bar's index (and the interval) to every aligned alert.
# The lookup columns are renamed up front so the join can be written on column names.
# A name-based join keeps a single copy of each key column; an expression join keeps
# both sides' `exchange`/`symbol` columns, which makes every later reference to them
# (out.exchange, select('exchange', ...)) ambiguous.
_entry_bar_lookup = (ohlc_idx
  .where(F.col('interval') == INTERVAL)
  .select(
    'exchange', 'symbol', 'interval',
    F.col('open_time').alias('entry_open_time'),
    F.col('bar_idx').alias('entry_bar_idx'),
  )
)

entry_idx = entry.join(
  _entry_bar_lookup,
  on=['exchange', 'symbol', 'entry_open_time'],
  how='inner',
)

# Forward returns at horizons: look up the close of bar (entry_bar_idx + h).
# The target's bar index is shifted back by h so it can be matched by name against
# entry_bar_idx. The left join keeps alerts whose horizon runs past the last
# available bar; their close_h{h} / ret_h{h} are NULL.
out = entry_idx
for h in HORIZONS:
    tgt = (ohlc_idx
      .where(F.col('interval') == INTERVAL)
      .select(
        'exchange', 'symbol', 'interval',
        (F.col('bar_idx') - F.lit(h)).alias('entry_bar_idx'),
        F.col('close').alias(f'close_h{h}'),
      )
    )
    out = (out
      .join(tgt, on=['exchange', 'symbol', 'interval', 'entry_bar_idx'], how='left')
      .withColumn(f'ret_h{h}', (F.col(f'close_h{h}')/F.col('entry_open')) - F.lit(1.0))
    )

# MAE/MFE over the bars following entry.
# Each alert is paired with the bars whose index lies in
# [entry_bar_idx, entry_bar_idx + MAE_MFE_WINDOW] (both ends inclusive); the highest
# high and lowest low in that range are then expressed relative to entry_open.
_mfe_keys = [
    'exchange', 'symbol', 'interval', 'signal', 'source_tf', 'ts',
    'entry_bar_idx', 'entry_open', 'hour_syd', 'dow_syd',
]
_last_bar_idx = F.col('entry_bar_idx') + F.lit(MAE_MFE_WINDOW)

bars = (
    out
    .select(*_mfe_keys)
    .join(ohlc_idx, on=['exchange', 'symbol', 'interval'], how='inner')
    .filter(F.col('bar_idx').between(F.col('entry_bar_idx'), _last_bar_idx))
)

mae_mfe = bars.groupBy(*_mfe_keys).agg(
    F.max('high').alias('max_high_w'),
    F.min('low').alias('min_low_w'),
)
mae_mfe = (
    mae_mfe
    .withColumn('mfe', (F.col('max_high_w') / F.col('entry_open')) - 1.0)
    .withColumn('mae', (F.col('min_low_w') / F.col('entry_open')) - 1.0)
)

# Attach the excursions to the forward-return rows; the left join keeps every alert.
out2 = out.join(mae_mfe, on=_mfe_keys, how='left')

display(
    out2
    .select('exchange','symbol','signal','source_tf','entry_open','ret_h1','ret_h4','ret_h24','mfe','mae','hour_syd','dow_syd')
    .limit(50)
)


## Aggregate: win rates and distribution summaries
We’ll compute hit-rate for positive returns and quantiles by signal/timeframe and Sydney hour/dow.

In [None]:
def agg_for_h(h: int) -> list:
    """Build the aggregate expressions that summarize the `ret_h{h}` column.

    Args:
        h: Horizon in bars; selects the `ret_h{h}` column to summarize.

    Returns:
        A list of aggregate columns, in this order: `n` (row count of the group),
        `hit_rate_pos` (share of returns > 0), `avg_ret`, `median_ret`, `p10_ret`,
        `p90_ret`. Pass it to `DataFrame.agg` with `*`.

    `n` counts every row in the group, including rows whose return is NULL. All
    other statistics are computed over the non-NULL returns only.
    """
    ret = F.col(f'ret_h{h}')
    return [
        F.count('*').alias('n'),
        # A NULL return (horizon past the last available bar) yields a NULL flag,
        # which avg() skips. Mapping it to 0.0 would count missing outcomes as
        # misses and give the hit rate a different denominator from avg_ret.
        F.avg((ret > 0).cast('double')).alias('hit_rate_pos'),
        F.avg(ret).alias('avg_ret'),
        F.expr(f'percentile_approx(ret_h{h}, 0.5)').alias('median_ret'),
        F.expr(f'percentile_approx(ret_h{h}, 0.1)').alias('p10_ret'),
        F.expr(f'percentile_approx(ret_h{h}, 0.9)').alias('p90_ret'),
    ]

# Horizon (in bars) summarized below; 24 bars is used as the example.
H = 24

# Per (signal, source_tf): return statistics at horizon H plus average MFE/MAE,
# ranked by average return, best first.
agg = out2.groupBy('signal', 'source_tf').agg(
    *agg_for_h(H),
    F.avg('mfe').alias('avg_mfe'),
    F.avg('mae').alias('avg_mae'),
).sort(F.desc('avg_ret'))
display(agg.limit(200))

# Session effects (Sydney): the same return statistics split further by day of week
# and hour, keeping only groups with at least 50 rows.
agg_sess = (
    out2
    .groupBy('signal', 'source_tf', 'dow_syd', 'hour_syd')
    .agg(*agg_for_h(H))
    .filter(F.col('n') >= 50)
    .sort(F.desc('avg_ret'))
)
display(agg_sess.limit(200))
