In [1]:
import os
import sys

# Repository root, resolved two directories above the current working directory.
project_root = os.path.abspath("../..")

# Make project packages importable from this notebook (added at most once).
if project_root not in sys.path:
    sys.path.append(project_root)

# Have PySpark use the same Python interpreter as this kernel.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

In [2]:
from datapipeline.utils.spark_session import get_spark_session

# Argument handed to the project's session helper
# (presumably the Spark application name — confirm against the helper).
session_name = "ML_Event_Detection"

# Session object used by the later cells of this notebook.
spark = get_spark_session(session_name)

In [3]:
# Location of the labeled-clusters Delta table under the repository root.
clusters_path = os.path.join(project_root, "sanewsstorage/ml/clusters_labeled")

# Read the table as a Spark DataFrame via the Delta reader.
delta_reader = spark.read.format("delta")
df = delta_reader.load(clusters_path)

In [4]:
from pyspark.sql.functions import col, to_date

# Keep only the columns used downstream, then derive the calendar date
# of each article from its `published_at` value.
df = (
    df.select("bronze_hash", "cluster_id", "cluster_label", "published_at")
    .withColumn("date", to_date(col("published_at")))
)

In [5]:
from pyspark.sql.functions import count

# One output row per (cluster_id, cluster_label, date) with the number of
# articles in that group, counted on `bronze_hash`.
grouping_keys = ["cluster_id", "cluster_label", "date"]
articles_per_group = count("bronze_hash").alias("article_count")

daily_counts = df.groupBy(*grouping_keys).agg(articles_per_group)

In [6]:
from pyspark.sql.window import Window
from pyspark.sql.functions import avg, col, datediff, lit, stddev

# Trailing baseline: the 7 calendar days before each row's date, excluding
# the day itself.
#
# A row-based frame (rowsBetween) takes the previous 7 *rows*, which reaches
# further back than a week whenever a cluster has days without any articles.
# A range frame needs a numeric ordering key, so order by the number of days
# elapsed since a fixed reference date.
#
# NOTE: days with no articles have no row in `daily_counts`, so they are
# absent from the baseline rather than counted as zero.
day_number = datediff(col("date"), lit("1970-01-01").cast("date"))

window_spec = (
    Window
    .partitionBy("cluster_id")
    .orderBy(day_number)
    .rangeBetween(-7, -1)   # previous 7 calendar days
)

# Rolling mean and sample standard deviation of the daily article count.
baseline_df = (
    daily_counts
    .withColumn("baseline_avg", avg("article_count").over(window_spec))
    .withColumn("baseline_std", stddev("article_count").over(window_spec))
)

In [7]:
from pyspark.sql.functions import col, lit, when

# z-score of each day's article count against its trailing baseline.
#
# The score falls back to 0.0 whenever the baseline has no usable spread:
#   * baseline_std is NULL — too little history in the window;
#   * baseline_std is 0    — a perfectly flat baseline; dividing by it would
#     yield NULL (or raise a divide-by-zero error when ANSI mode is enabled).
# NOTE(review): a spike over a flat baseline therefore scores 0.0; decide
# whether such days should instead be scored with a minimum-spread floor.
no_usable_spread = col("baseline_std").isNull() | (col("baseline_std") == 0)

event_df = baseline_df.withColumn(
    "z_score",
    when(no_usable_spread, lit(0.0)).otherwise(
        (col("article_count") - col("baseline_avg")) / col("baseline_std")
    ),
)

In [8]:
# Binary flag: 1 when the day's z-score is at least 2.0, otherwise 0.
exceeds_threshold = col("z_score") >= 2.0
event_flag = when(exceeds_threshold, 1).otherwise(0)

event_df = event_df.withColumn("is_event", event_flag)

In [9]:
# Bucket the z-score into a label; conditions are checked in order,
# so the higher tier wins when both thresholds are met.
is_viral = col("z_score") >= 4
is_trending = col("z_score") >= 2

intensity_label = (
    when(is_viral, "Viral")
    .when(is_trending, "Trending")
    .otherwise("Normal")
)

event_df = event_df.withColumn("event_intensity", intensity_label)

In [10]:
from pyspark.sql.functions import current_date, date_sub

# Flagged rows whose date is no older than three days before the current date.
cutoff_date = date_sub(current_date(), 3)
is_flagged = col("is_event") == 1
is_recent = col("date") >= cutoff_date

recent_events = event_df.filter(is_flagged & is_recent)

In [11]:
# Destination of the events Delta table under the repository root.
event_path = os.path.join(project_root, "sanewsstorage/ml/events")

In [12]:
from delta.tables import DeltaTable

# Upsert key for the events table: (cluster_id, date).
# `<=>` is null-safe equality. With plain `=`, a row whose key is NULL never
# matches its previously written copy (NULL = NULL is not true), so it would
# be inserted again on every run of this notebook.
# NOTE(review): `event_df` is grouped by cluster_label as well; if one cluster
# can carry several labels on the same date, several source rows would match
# one target row — confirm labels are unique per cluster.
merge_condition = "t.cluster_id <=> s.cluster_id AND t.date <=> s.date"

if DeltaTable.isDeltaTable(spark, event_path):
    # Table already exists: update matching rows and insert the new ones.
    delta_table = DeltaTable.forPath(spark, event_path)

    (
        delta_table.alias("t")
        .merge(event_df.alias("s"), merge_condition)
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )

else:
    # No Delta table at this path yet: write the full DataFrame to create it.
    (
        event_df.write
        .format("delta")
        .mode("overwrite")
        .save(event_path)
    )


In [13]:
# Print the first five scored rows as a quick sanity check.
event_df.show(n=5)

+----------+--------------------+----------+-------------+------------------+------------------+-------------------+--------+---------------+
|cluster_id|       cluster_label|      date|article_count|      baseline_avg|      baseline_std|            z_score|is_event|event_intensity|
+----------+--------------------+----------+-------------+------------------+------------------+-------------------+--------+---------------+
|         0|2026, available, ...|2026-02-02|           58|              NULL|              NULL|                0.0|       0|         Normal|
|         0|2026, available, ...|2026-02-03|          292|              58.0|              NULL|                0.0|       0|         Normal|
|         0|2026, available, ...|2026-02-04|          386|             175.0|165.46298679765212| 1.2752096652167653|       0|         Normal|
|         0|2026, available, ...|2026-02-05|          212|245.33333333333334|168.90628565371193|-0.1973480927860354|       0|         Normal|
|     