In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Fully qualified name of the source table, kept in one place so it is easy to change.
EVENTS_TABLE = "workspace.ecommerce.ecommerce_events_delta"

# Raw e-commerce event log; every later cell derives its DataFrames from this one.
events = spark.table(EVENTS_TABLE)

In [0]:
# Print the column names and data types of the events table before using them below.
events.printSchema()

### Summary statistics (price)

In [0]:
# Descriptive statistics for the price column only.
price_summary = events.select("price").describe()
price_summary.show()

### Testing hypothesis (Weekday vs Weekend behavior)

In [0]:
# Spark's dayofweek() numbers days 1 (Sunday) through 7 (Saturday).
WEEKEND_DAY_NUMBERS = [1, 7]

# Boolean flag: True when the event happened on a Saturday or Sunday.
weekend_flag = F.dayofweek("event_time").isin(WEEKEND_DAY_NUMBERS)
events_with_day = events.withColumn("is_weekend", weekend_flag)

In [0]:
# Number of events for every (is_weekend, event_type) pair, sorted so the
# weekday and weekend rows of each event type are easy to compare.
weekend_event_counts = (
    events_with_day
    .groupBy("is_weekend", "event_type")
    .count()
    .orderBy("is_weekend", "event_type")
)
weekend_event_counts.show()

In [0]:
# Keep purchase events only, then count them separately for weekdays and weekends.
is_purchase_event = F.col("event_type") == "purchase"
weekend_purchase_counts = (
    events_with_day
    .where(is_purchase_event)
    .groupBy("is_weekend")
    .count()
)
weekend_purchase_counts.show()

### Correlation (price vs. purchase)

In [0]:
# 0/1 indicator: 1 when the row is a purchase event, 0 for everything else.
purchase_flag = F.when(F.col("event_type") == "purchase", 1).otherwise(0)
events_corr = events.withColumn("is_purchase", purchase_flag)

# Correlation between price and the purchase indicator; left as the last
# expression so the value is displayed as the cell output.
events_corr.stat.corr("price", "is_purchase")

### Features for ML

Time-based features

In [0]:
# Day-of-week expression built once and reused for both derived columns.
# Spark's dayofweek() numbers days 1 (Sunday) through 7 (Saturday).
day_of_week_expr = F.dayofweek("event_time")

features_df = (
    events
    .withColumn("hour", F.hour("event_time"))
    .withColumn("day_of_week", day_of_week_expr)
    .withColumn("is_weekend", day_of_week_expr.isin([1, 7]))
)

Log-transform price (handle skew)

In [0]:
# log(1 + price): compresses the long right tail of prices while keeping a
# price of 0 mapped to 0. log1p is the built-in for this and is more precise
# than log(price + 1) when price is close to zero.
features_df = features_df.withColumn("price_log", F.log1p(F.col("price")))

User behavior feature

In [0]:
# Per-user window in chronological order; retained in case later cells use it.
user_window = Window.partitionBy("user_id").orderBy("event_time")

# Unordered per-user window used to find each user's earliest event.
# min() skips NULL timestamps, whereas first() over an ascending window would
# return NULL for the whole user as soon as one event_time is NULL (NULLs sort
# first in ascending order). It also avoids sorting every partition.
user_partition = Window.partitionBy("user_id")

# Seconds elapsed between this event and the same user's earliest event.
features_df = features_df.withColumn(
    "time_since_first_event",
    F.unix_timestamp("event_time")
    - F.unix_timestamp(F.min("event_time").over(user_partition)),
)