### Gold Layer Analytics

In [2]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip

# Build a Spark session with the Delta Lake SQL extension and catalog enabled so
# this notebook can read and write Delta tables. The application name identifies
# this job (Gold layer) in the Spark UI and logs.
builder = (
    SparkSession.builder
        .appName("Gold Layer Analytics")
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# configure_spark_with_delta_pip wires the pip-installed Delta package into the builder.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# Keep notebook output readable: only warnings and errors from Spark.
spark.sparkContext.setLogLevel("WARN")

# NOTE(review): 4 shuffle partitions is presumably sized for a small/local run — confirm
# before running against a larger cluster or dataset.
spark.conf.set("spark.sql.shuffle.partitions", "4")

### Loading Silver Dataset

Read curated Silver Delta table as the trusted input for analytical processing.
This dataset is clean, standardized, and optimized for aggregation.

In [3]:
# Silver-layer enriched orders: the single input for every Gold aggregate below.
silver_orders = spark.read.load(
    "../delta/02_silver/orders_enriched",
    format="delta",
)

In [4]:
# Sanity-check the Silver input before aggregating: print the column names/types...
silver_orders.printSchema()
# ...and return the row count (displayed as the cell's output). This triggers a Spark job.
silver_orders.count()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = true)
 |-- total_price: double (nullable = true)
 |-- profit_margin: double (nullable = true)
 |-- delivery_time_days: integer (nullable = true)
 |-- payment_count: long (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)



26/01/12 01:25:15 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

112650

### Time-Based Analysis

Aggregate order and revenue metrics by year, month, and day.
This enables trend analysis and time-series reporting.

In [5]:
# Daily sales: one row per (year, month, day) with the distinct order count,
# summed total_price and profit_margin, and the mean delivery time in days.
# Rows are sorted chronologically so previews read as a time series.
gold_daily_sales = silver_orders.groupBy("year", "month", "day").agg(
    F.countDistinct(F.col("order_id")).alias("total_orders"),
    F.sum(F.col("total_price")).alias("total_revenue"),
    F.sum(F.col("profit_margin")).alias("total_profit"),
    F.avg(F.col("delivery_time_days")).alias("avg_delivery_days"),
).sort("year", "month", "day")

In [6]:
# Preview the first five daily rows to eyeball the aggregates.
gold_daily_sales.show(n=5)



+----+-----+---+------------+-----------------+------------------+-----------------+
|year|month|day|total_orders|    total_revenue|      total_profit|avg_delivery_days|
+----+-----+---+------------+-----------------+------------------+-----------------+
|2016|    9|  4|           1|           136.23| 9.549999999999997|             NULL|
|2016|    9|  5|           1|            75.06|             43.94|             NULL|
|2016|    9| 15|           1|           143.46|126.48000000000002|             55.0|
|2016|   10|  2|           1|           109.34|             90.66|             NULL|
|2016|   10|  3|           8|595.1400000000001|331.82000000000005|           26.625|
+----+-----+---+------------+-----------------+------------------+-----------------+
only showing top 5 rows


                                                                                

In [7]:
# Persist the daily sales table to the Gold layer as Delta, replacing any
# previous contents and partitioning the files by year and month.
gold_daily_sales.write.save(
    "../delta/03_gold/daily_sales",
    format="delta",
    mode="overwrite",
    partitionBy=["year", "month"],
)

                                                                                

### Revenue and Profitability KPIs

Compute key per-seller indicators: distinct order count, total revenue, and average delivery time.
These metrics support business performance monitoring.

In [8]:
# Seller performance: one row per seller_id with the distinct order count,
# summed total_price, and mean delivery time in days.
gold_seller_performance = silver_orders.groupBy("seller_id").agg(
    F.countDistinct(F.col("order_id")).alias("orders_count"),
    F.sum(F.col("total_price")).alias("seller_revenue"),
    F.avg(F.col("delivery_time_days")).alias("avg_delivery_days"),
)

In [9]:
# Persist the seller performance table to the Gold layer as Delta,
# replacing any previous contents.
gold_seller_performance.write.save(
    "../delta/03_gold/seller_performance",
    format="delta",
    mode="overwrite",
)

                                                                                

### Product and Seller Performance

Aggregate units sold, revenue, and profit at the product level.
These insights are used to evaluate performance and identify optimization opportunities.

In [10]:
# Product performance: one row per product_id. units_sold counts the rows with a
# non-null order_id; revenue and profit are the sums of total_price and profit_margin.
gold_product_performance = silver_orders.groupBy("product_id").agg(
    F.count(F.col("order_id")).alias("units_sold"),
    F.sum(F.col("total_price")).alias("product_revenue"),
    F.sum(F.col("profit_margin")).alias("product_profit"),
)

In [11]:
# Persist the product performance table to the Gold layer as Delta,
# replacing any previous contents.
gold_product_performance.write.save(
    "../delta/03_gold/product_performance",
    format="delta",
    mode="overwrite",
)

                                                                                

### Customer Analytics

Generate customer-centric metrics including order counts, spending behavior, and lifecycle indicators.
This dataset supports segmentation and retention analysis.

In [12]:
# Customer behavior: one row per customer_id with the distinct order count,
# total spend, and average order value.
#
# The Silver table can hold several rows for the same order (order counts
# elsewhere use countDistinct on order_id), so a plain avg(total_price) would
# average over rows, not orders. Average order value is therefore computed as
# total spend divided by the number of distinct orders.
gold_customer_behavior = (
    silver_orders
        .groupBy("customer_id")
        .agg(
            F.countDistinct("order_id").alias("orders_count"),
            F.sum("total_price").alias("total_spent"),
            # Guard the division: if every order_id in the group is null the
            # distinct count is 0, and the result is left NULL instead of
            # dividing by zero.
            F.when(
                F.countDistinct("order_id") > 0,
                F.sum("total_price") / F.countDistinct("order_id"),
            ).alias("avg_order_value")
        )
)

In [13]:
# Persist the customer behavior table to the Gold layer as Delta,
# replacing any previous contents.
gold_customer_behavior.write.save(
    "../delta/03_gold/customer_behavior",
    format="delta",
    mode="overwrite",
)

                                                                                

In [19]:
# Read the written daily sales table back from disk and preview five rows.
spark.read.load("../delta/03_gold/daily_sales", format="delta").show(n=5)

+----+-----+---+------------+------------------+------------------+------------------+
|year|month|day|total_orders|     total_revenue|      total_profit| avg_delivery_days|
+----+-----+---+------------+------------------+------------------+------------------+
|2018|    6|  1|         184|          37282.67|27315.170000000002|12.018957345971565|
|2018|    6|  2|         142|28939.989999999998|          21593.25|12.089171974522293|
|2018|    6|  3|         191|           32869.9|          24053.42| 9.088785046728972|
|2018|    6|  4|         225|           34326.4|22918.399999999998| 8.945098039215686|
|2018|    6|  5|         197|          36101.72|           25317.0| 9.635983263598327|
+----+-----+---+------------+------------------+------------------+------------------+
only showing top 5 rows


In [16]:
# Read the written seller performance table back and return its row count.
spark.read.load("../delta/03_gold/seller_performance", format="delta").count()

3095

In [17]:
# Read the written product performance table back and return its row count.
spark.read.load("../delta/03_gold/product_performance", format="delta").count()

32951

In [18]:
# Read the written customer behavior table back and return its row count.
spark.read.load("../delta/03_gold/customer_behavior", format="delta").count()

98666