# Initialization

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.window import Window

# Business Logic

In [0]:
# Builds the customer dimension: CRM customer attributes, the ERP country
# (defaulting to 'n/a' when no location row matches), and per-customer totals
# of distinct orders, sales amount and units taken from CRM sales.
#
# Notes for maintainers:
# - `customers` and `loc` use SELECT DISTINCT. Both are joined to `sales`
#   BEFORE aggregation, so a duplicated customer or location row would repeat
#   every matching sales row and inflate total_sales / total_units, while
#   total_orders (COUNT(DISTINCT ...)) would stay correct and silently disagree.
# - `sales` is deliberately NOT de-duplicated: two rows sharing order_number,
#   sales_amount and quantity may be distinct order lines.
# - customer_key is ROW_NUMBER() ordered by the customer id over the whole
#   result, so the values are recomputed on every run and can shift when
#   customers are added.
# - NOTE(review): the window has no PARTITION BY, which normally makes Spark
#   evaluate it on a single partition - acceptable for a small dimension,
#   revisit if the customer table grows large.
query = """
WITH customers AS (
  -- DISTINCT: exact duplicate customer rows must not fan out the sales join
  SELECT DISTINCT
    CAST(customer_id AS STRING) AS customer_id_str,
    customer_key,
    first_name,
    last_name,
    marital_status,
    gender,
    created_date
  FROM silver.crm_customers
),
sales AS (
  SELECT
    CAST(customer_id AS STRING) AS customer_id_str,
    order_number,
    sales_amount,
    quantity
  FROM silver.crm_sales
),
loc AS (
  -- DISTINCT: a repeated (company_id, country) pair must not fan out the sales join
  SELECT DISTINCT
    CAST(company_id AS STRING) AS company_id_str,
    country
  FROM silver.erp_customer_location
)

SELECT
  ROW_NUMBER() OVER (ORDER BY c.customer_id_str) AS customer_key,

  -- keep customer_id as STRING so AW00011000 is valid
  c.customer_id_str AS customer_id,

  c.customer_key AS customer_business_key,
  c.first_name,
  c.last_name,
  c.marital_status,
  c.gender,
  c.created_date,

  COALESCE(l.country, 'n/a') AS country,

  COUNT(DISTINCT s.order_number) AS total_orders,
  SUM(COALESCE(s.sales_amount, 0)) AS total_sales,
  SUM(COALESCE(s.quantity, 0)) AS total_units

FROM customers c
LEFT JOIN loc l
  ON c.customer_id_str = l.company_id_str
LEFT JOIN sales s
  ON c.customer_id_str = s.customer_id_str

GROUP BY
  c.customer_id_str,
  c.customer_key,
  c.first_name,
  c.last_name,
  c.marital_status,
  c.gender,
  c.created_date,
  l.country
"""
df = spark.sql(query)



# Write to the Gold Table

In [0]:
# Re-create the DataFrame from `query` so the write reflects the current SQL text.
df = spark.sql(query)

# Persist as the Delta table gold.dim_customers; "overwrite" mode replaces the
# table's existing contents, which also suits the very first load.
gold_writer = df.write.format("delta").mode("overwrite")
gold_writer.saveAsTable("gold.dim_customers")


In [0]:
%sql
-- Preview the customer dimension.
-- NOTE(review): this reads workspace.gold.dim_customers while the write cell
-- saves to gold.dim_customers; presumably the current catalog is `workspace` - confirm.
SELECT * FROM workspace.gold.dim_customers