# Day 11 of 14 Days Databricks Challenge

In [0]:
from datetime import datetime
from pyspark.sql import functions as F
from pyspark.sql.window import Window

def log(msg):
    """Print ``msg`` to stdout, prefixed with the current local timestamp.

    Args:
        msg: Text to display after the timestamp and the bullet marker.
    """
    # The bullet used to be mojibake (the UTF-8 bytes of U+1F539 read as
    # cp1252). It is written as an escape so the source file stays ASCII and
    # cannot be damaged by another encoding round-trip.
    print(f"[{datetime.now()}] \U0001F539 {msg}")


In [0]:
# Load the silver-layer events table; `events` is the input of the later cells.
log("Loading silver_events table")
events = spark.table("default.silver_events")

# Build the summary-statistics frame for the price column, then print it.
log("Calculating descriptive statistics for price")
price_summary = events.select("price").describe()
price_summary.show()


In [0]:
log("Deriving event_date and is_weekend flag")

# Spark's dayofweek() numbers days from 1 (Sunday) to 7 (Saturday).
weekend_days = [1, 7]
is_weekend_flag = F.dayofweek("event_time").isin(weekend_days)

events_wd = events.withColumn(
    "event_date", F.to_date("event_time")
).withColumn("is_weekend", is_weekend_flag)

log("Aggregating event counts by weekend and event_type")

# Count events per (is_weekend, event_type) pair and print them sorted by
# the same two columns.
grouping_cols = ["is_weekend", "event_type"]
weekend_counts = events_wd.groupBy(*grouping_cols).count()
weekend_counts.orderBy(*grouping_cols).show()


In [0]:
log("Creating binary purchase flag")

# 1 for rows whose event_type is "purchase", 0 for every other row.
purchase_flag = F.when(F.col("event_type") == "purchase", 1).otherwise(0)
events_corr = events.withColumn("is_purchase", purchase_flag)

log("Calculating correlation between price and purchase probability")

# DataFrame.corr is the DataFrame-level alias of DataFrame.stat.corr.
corr_value = events_corr.corr("price", "is_purchase")
log(f"Correlation (price vs purchase): {corr_value}")


In [0]:
log("Starting feature engineering")

# Per-user window in chronological order. The name is kept unchanged in case
# other cells rely on it.
window_user = Window.partitionBy("user_id").orderBy("event_time")

# Ascending order sorts NULL event_time values first, so a plain first()
# would return NULL for every row of a user that has even one NULL timestamp.
# ignorenulls=True makes it return the user's earliest non-null event_time.
first_event_time = F.first("event_time", ignorenulls=True).over(window_user)

features = (
    events
    # Calendar features derived from the event timestamp.
    .withColumn("event_date", F.to_date("event_time"))
    .withColumn("hour", F.hour("event_time"))
    .withColumn("day_of_week", F.dayofweek("event_time"))
    # Natural log of (price + 1); the +1 keeps a price of 0 defined.
    .withColumn("price_log", F.log(F.col("price") + 1))
    # Seconds elapsed since the user's first recorded event.
    .withColumn(
        "time_since_first_event",
        F.unix_timestamp("event_time") - F.unix_timestamp(first_event_time)
    )
)

log("Feature engineering completed")
features.select(
    "user_id",
    "hour",
    "day_of_week",
    "price_log",
    "time_since_first_event"
).show(20)
