In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Source table for every analysis below; kept in one place so it is easy to repoint.
EVENTS_TABLE = "ecommerce.silver.events"

events = spark.table(EVENTS_TABLE)


In [0]:
# Built-in summary (count / mean / stddev / min / max) for the price column.
price_only = events.select("price")
price_only.describe().show()

# Extended summary: the same moments plus the total row count and the median
# (computed with Spark SQL's `percentile` aggregate at 0.5).
price_aggregates = [
    F.count("*").alias("rows"),
    F.avg("price").alias("avg_price"),
    F.expr("percentile(price, 0.5)").alias("median_price"),
    F.stddev("price").alias("std_price"),
    F.min("price").alias("min_price"),
    F.max("price").alias("max_price"),
]
events.agg(*price_aggregates).show()


+-------+-----+
|summary|price|
+-------+-----+
|  count|    0|
|   mean| NULL|
| stddev| NULL|
|    min| NULL|
|    max| NULL|
+-------+-----+

+----+---------+------------+---------+---------+---------+
|rows|avg_price|median_price|std_price|min_price|max_price|
+----+---------+------------+---------+---------+---------+
|   0|     NULL|        NULL|     NULL|     NULL|     NULL|
+----+---------+------------+---------+---------+---------+



In [0]:
# dayofweek() numbers days from 1 (Sunday) to 7 (Saturday), so the weekend is {1, 7}.
weekend_day_numbers = (1, 7)
is_weekend_expr = F.dayofweek(F.col("event_date")).isin(*weekend_day_numbers)

# Boolean flag column marking events that fall on a Saturday or Sunday.
events_with_flag = events.withColumn("is_weekend", is_weekend_expr)


In [0]:
# Event volume per (weekend flag, event type), sorted by the same keys for a stable display.
breakdown_keys = ["is_weekend", "event_type"]
(
    events_with_flag
    .groupBy(*breakdown_keys)
    .count()
    .orderBy(*breakdown_keys)
    .show()
)


+----------+----------+-----+
|is_weekend|event_type|count|
+----------+----------+-----+
+----------+----------+-----+



In [0]:
# View -> purchase conversion rate (in percent), split by weekend vs. weekday.
#
# `views` and `purchases` are per-group counts of rows whose event_type is
# "view" / "purchase" respectively (rows with any other or NULL event_type
# contribute 0 to both).
views_count = F.sum(F.when(F.col("event_type") == "view", 1).otherwise(0))
purchases_count = F.sum(F.when(F.col("event_type") == "purchase", 1).otherwise(0))

conversion = events_with_flag.groupBy("is_weekend").agg(
    views_count.alias("views"),
    purchases_count.alias("purchases")
).withColumn(
    "conversion_rate",
    # Guard the division: a group with zero views has no defined rate.
    # Without the guard, Spark raises DIVIDE_BY_ZERO when ANSI mode is on
    # and silently yields NULL when it is off; with it the result is NULL
    # in both modes (no `otherwise` branch -> NULL).
    F.when(
        F.col("views") > 0,
        F.round(F.col("purchases") * 100 / F.col("views"), 2)
    )
)

conversion.show()


+----------+-----+---------+---------------+
|is_weekend|views|purchases|conversion_rate|
+----------+-----+---------+---------------+
+----------+-----+---------+---------------+



In [0]:
# 0/1 indicator: 1 for purchase events, 0 for everything else (including NULL event_type).
purchase_indicator = F.when(F.col("event_type") == "purchase", 1).otherwise(0)
corr_df = events.withColumn("is_purchase", purchase_indicator)

# Correlation between item price and the purchase indicator, computed over all events.
corr_df.stat.corr("price", "is_purchase")


nan

In [0]:
# Price counted only on purchase events; every other event contributes 0 to the spend total.
purchase_amount = F.when(
    F.col("event_type") == "purchase", F.col("price")
).otherwise(0)

# One row per user: how many events they generated and how much they spent.
user_metrics = (
    events
    .groupBy("user_id")
    .agg(
        F.count("*").alias("event_count"),
        F.sum(purchase_amount).alias("total_spent"),
    )
)

# Correlation between per-user activity volume and per-user spend.
user_metrics.stat.corr("event_count", "total_spent")


nan

In [0]:
# Per-user window in chronological order; used to find each user's first event.
window_user = Window.partitionBy("user_id").orderBy("event_time")

# Row-level feature set for a purchase-prediction model.
#   hour / day_of_week / is_weekend : calendar features (is_weekend is 0/1 here)
#   price_log                       : log(1 + price)
#   event_ts / first_event_ts       : epoch seconds of this event and of the
#                                     user's earliest event
#   time_since_first_event          : seconds elapsed since the user's first event
#   label                           : 1 for purchase events, 0 otherwise
features_df = (
    events
    .withColumn("hour", F.hour("event_time"))
    .withColumn("day_of_week", F.dayofweek("event_date"))
    # dayofweek(): 1 = Sunday, 7 = Saturday. Reuse the column defined just above.
    .withColumn("is_weekend", F.col("day_of_week").isin([1, 7]).cast("int"))
    .withColumn("price_log", F.log1p("price"))
    .withColumn("event_ts", F.unix_timestamp("event_time"))
    # Ascending sort places NULL event_time rows first, so a plain first()
    # would return NULL for every row of a user that has even one NULL
    # timestamp. ignorenulls=True picks the earliest non-NULL timestamp instead.
    .withColumn(
        "first_event_ts",
        F.first("event_ts", ignorenulls=True).over(window_user),
    )
    .withColumn(
        "time_since_first_event",
        F.col("event_ts") - F.col("first_event_ts"),
    )
    .withColumn(
        "label",
        F.when(F.col("event_type") == "purchase", 1).otherwise(0),
    )
)


In [0]:
# Preview the first 5 rows of the engineered feature columns.
preview_columns = [
    "user_id",
    "hour",
    "day_of_week",
    "is_weekend",
    "price_log",
    "time_since_first_event",
    "label",
]
features_df.select(*preview_columns).show(5)


+-------+----+-----------+----------+---------+----------------------+-----+
|user_id|hour|day_of_week|is_weekend|price_log|time_since_first_event|label|
+-------+----+-----------+----------+---------+----------------------+-----+
+-------+----+-----------+----------+---------+----------------------+-----+

