# Gold Aggregation

This notebook loads the example CSV, types it into a curated (silver) DataFrame, and aggregates that into gold-level metrics by plan and by signup month.


In [None]:
# Reuse `spark` when the name is already defined in this runtime; otherwise
# fall back to creating (or attaching to) a session for this notebook.
try:
    spark
except NameError:
    from pyspark.sql import SparkSession

    spark = (
        SparkSession.builder
        .appName("gold-aggregation")
        .getOrCreate()
    )


In [None]:
from pyspark.sql import functions as F

# Relative path to the input CSV.
data_path = "../../data/example.csv"

# Read the CSV, using the first row as column names and letting Spark infer
# the column types.
raw = spark.read.csv(data_path, header=True, inferSchema=True)

# Silver frame: pin the types of the date and spend columns explicitly rather
# than relying on whatever schema inference produced.
silver = raw.withColumn(
    "signup_date", F.to_date(F.col("signup_date"))
).withColumn(
    "spend", F.col("spend").cast("double")
)


In [None]:
# Per-plan gold metrics: row count, total and average spend (each rounded to
# two decimals) and the number of active users, ordered by total spend
# from highest to lowest.
gold_by_plan = (
    silver
    .groupBy("plan")
    .agg(
        F.count("*").alias("users"),
        F.round(F.sum("spend"), 2).alias("total_spend"),
        F.round(F.avg("spend"), 2).alias("avg_spend"),
        # `is_active` is not typed in the silver step, so its type comes from
        # CSV schema inference. `F.when` needs a boolean condition; cast
        # explicitly so the aggregation still works if the flag was inferred
        # as an integer or string. The cast is a no-op for a boolean column.
        F.sum(
            F.when(F.col("is_active").cast("boolean"), 1).otherwise(0)
        ).alias("active_users"),
    )
    .orderBy(F.col("total_spend").desc())
)

gold_by_plan.show(truncate=False)


In [None]:
# Monthly gold metrics keyed by signup month (formatted "yyyy-MM"): row count
# and total spend rounded to two decimals, sorted by month ascending.
gold_by_month = (
    silver
    .groupBy(
        F.date_format("signup_date", "yyyy-MM").alias("signup_month")
    )
    .agg(
        F.count("*").alias("users"),
        F.round(F.sum("spend"), 2).alias("total_spend"),
    )
    .sort("signup_month")
)

gold_by_month.show(truncate=False)
