In [0]:
# 03_gold_analytics

from pyspark.sql import functions as F

# Load the Silver retail-sales table that every Gold aggregate below reads.
silver = spark.table("etl_demo.silver_retail_sales")

# Quick sanity output: how many rows were loaded and which columns exist.
print(f"Loaded Silver rows: {silver.count()}")
print(f"Silver columns: {silver.columns}")

# ---------------------------------------
# 1. GOLD: DAILY REVENUE & ORDER VOLUME
# ---------------------------------------
# Builds one row per calendar day from the Silver sales table:
#   num_orders       - distinct order_id values seen that day
#   revenue_ex_tax   - SUM(quantity * unit_price)
#   tax_collected    - SUM(tax)
#   revenue_incl_tax - SUM(total_amount_final)
#
# The grouping key is the same CAST(order_date AS DATE) expression that is
# projected as `date`. Grouping by the raw order_date would emit one row per
# distinct timestamp (several rows sharing the same `date`, each holding only
# partial totals) whenever order_date carries a time component.
# NOTE(review): year and month are presumably derived from order_date in
# Silver -- confirm. They stay in the grouping key so they can be selected.

spark.sql("""
CREATE OR REPLACE TABLE etl_demo.gold_daily_revenue AS
SELECT
  CAST(order_date AS DATE) AS date,
  year,
  month,
  COUNT(DISTINCT order_id) AS num_orders,
  SUM(quantity * unit_price) AS revenue_ex_tax,
  SUM(tax) AS tax_collected,
  SUM(total_amount_final) AS revenue_incl_tax
FROM etl_demo.silver_retail_sales
GROUP BY CAST(order_date AS DATE), year, month
ORDER BY date
""")

print("✔ Created: etl_demo.gold_daily_revenue")

# Sort explicitly: rows read back from a table come in no guaranteed order,
# so the ORDER BY used while writing does not make this preview deterministic.
display(spark.table("etl_demo.gold_daily_revenue").orderBy("date").limit(10))

# ---------------------------------------
# 2. GOLD: TOP PRODUCTS BY REVENUE
# ---------------------------------------
# Builds one row per product_id from the Silver sales table:
#   quantity_sold    - SUM(quantity)
#   revenue_ex_tax   - SUM(quantity * unit_price)
#   revenue_incl_tax - SUM(total_amount_final)
#   num_orders       - distinct order_id values containing the product

spark.sql("""
CREATE OR REPLACE TABLE etl_demo.gold_top_products AS
SELECT
  product_id,
  SUM(quantity) AS quantity_sold,
  SUM(quantity * unit_price) AS revenue_ex_tax,
  SUM(total_amount_final) AS revenue_incl_tax,
  COUNT(DISTINCT order_id) AS num_orders
FROM etl_demo.silver_retail_sales
GROUP BY product_id
ORDER BY revenue_ex_tax DESC
""")

print("✔ Created: etl_demo.gold_top_products")

# Rank at read time: a table scan has no guaranteed row order, so a bare
# limit(10) is not guaranteed to return the ten highest-revenue products.
display(
    spark.table("etl_demo.gold_top_products")
    .orderBy("revenue_ex_tax", ascending=False)
    .limit(10)
)

# ---------------------------------------
# 3. GOLD: CUSTOMER PROFITABILITY (optional)
# ---------------------------------------
# Builds one row per customer_id:
#   orders_count    - distinct order_id values for the customer
#   total_spent     - sum of total_amount_final over all of their rows
#   avg_order_value - mean of the per-order totals
#   max_order_value - largest per-order total
#
# Amounts are first summed to one total per (customer_id, order_id) and only
# then averaged / maxed per customer. Taking AVG/MAX directly over Silver rows
# would describe individual rows rather than orders whenever an order spans
# more than one row; with exactly one row per order both forms agree.
# NOTE(review): assumes Silver can hold several rows per order_id, as the
# COUNT(DISTINCT order_id) usage suggests -- confirm against the Silver schema.

spark.sql("""
CREATE OR REPLACE TABLE etl_demo.gold_customer_value AS
SELECT
  customer_id,
  COUNT(DISTINCT order_id) AS orders_count,
  SUM(order_total) AS total_spent,
  AVG(order_total) AS avg_order_value,
  MAX(order_total) AS max_order_value
FROM (
  SELECT
    customer_id,
    order_id,
    SUM(total_amount_final) AS order_total
  FROM etl_demo.silver_retail_sales
  GROUP BY customer_id, order_id
) AS order_totals
GROUP BY customer_id
ORDER BY total_spent DESC
""")

print("✔ Created: etl_demo.gold_customer_value")

# Sort at read time: a table scan has no guaranteed row order, so a bare
# limit(10) is not guaranteed to show the highest-spending customers.
display(
    spark.table("etl_demo.gold_customer_value")
    .orderBy("total_spent", ascending=False)
    .limit(10)
)
