# Gold Layer (Sprint 2) – Dimension Development (dim_*)

## Purpose
This notebook builds the Gold **dimension tables** for Sprint 2’s star schema.
These dimensions are designed to support BI joins and filtering while keeping the fact table at the correct grain.

## Gold Layer Guardrails (Sprint 2)

- Gold notebooks must read **promoted Silver tables only** via:
  - `silver_shortcut.sl_*`
- Gold notebooks must **NOT** read from:
  - `sl_dev_*`
- Gold notebooks must **NOT** write back to Silver.

This guardrail ensures Gold development and validation are based on
stable, promoted Silver data only, even though `sl_dev_*` tables may
still be visible via schema shortcuts during development.

## Inputs (Read-only)
- Source of truth is **promoted Silver tables only** (NOT sl_dev_*).
- Read from Silver via OneLake shortcut:
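  - `silver_shortcut.sl_customers`
  - `silver_shortcut.sl_geolocation`
  - `silver_shortcut.sl_orders`
  - `silver_shortcut.sl_payments`
  - `silver_shortcut.sl_product_category_translation`
  - `silver_shortcut.sl_products`
  - `silver_shortcut.sl_sellers`
  - (add further `sl_*` tables here if new dimensions require them)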

## Outputs (Written to Gold)
- `dim_date`
- `dim_customer`
- `dim_product`
- `dim_seller`
- `dim_order`
- `dim_payment`

## Expected Grain
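- `dim_date`: 1 row per calendar date (`date_key`, formatted `yyyyMMdd`), generated as a continuous calendar starting 2016-01-01 rather than derived from order dates
- `dim_customer`: 1 row per `customer_id`
- `dim_product`: 1 row per `product_id`
- `dim_seller`: 1 row per `seller_id`
- `dim_order`: 1 row per `order_id`
- `dim_payment`: 1 row per distinct (`payment_type`, `payment_installments`) combination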

## Rules / Guardrails
- Do not write to Silver.
- Do not read from `sl_dev_*`.
- Do not perform fact-level aggregations here.
- Dimensions should be stable and reusable by multiple facts.

## Completion Criteria
- All dimension tables (`dim_date`, `dim_customer`, `dim_product`, `dim_seller`, `dim_order`, `dim_payment`) are created successfully in Gold.
- Keys are unique (no duplicates).
- Columns required by BI are present and well-named.

## Handoff
Once completed, Janson will use `nb_03_gold_star_dev_validation` to validate:
- dimension uniqueness
- join behavior with fact (no exploding joins)
- BI readiness


In [1]:
%%sql
-- =====================================================
-- GOLD LAYER - DIMENSION TABLES
-- =====================================================

-- =====================================================
-- DIM_DATE
-- =====================================================
-- Calendar dimension: one row per day from 2016-01-01 through 2030-12-31
-- (inclusive), keyed by date_key in yyyyMMdd integer form.
--
-- The calendar is generated from a date sequence rather than a fixed count of
-- integer offsets, so leap days are accounted for and the range always ends
-- exactly on the stop date. (An offset range of 0..5474 covers only 15 * 365
-- days and stops at 2030-12-27; the full range is 5479 days.)
--
-- Notes:
--   * day_of_week follows Spark's dayofweek(): 1 = Sunday ... 7 = Saturday,
--     which is why the weekend test is IN (1, 7).
--   * is_holiday is a constant placeholder (always false).
--   * fiscal_year / fiscal_quarter currently mirror the calendar values.

CREATE OR REPLACE TABLE dim_date
USING DELTA
AS
WITH calendar AS (
    -- One row per calendar day; both endpoints are inclusive.
    SELECT
        explode(
            sequence(DATE'2016-01-01', DATE'2030-12-31', INTERVAL 1 DAY)
        ) AS date_value
)
SELECT
    CAST(date_format(date_value, 'yyyyMMdd') AS INT) AS date_key,
    date_value AS date,
    year(date_value) AS year,
    quarter(date_value) AS quarter,
    concat('Q', quarter(date_value)) AS quarter_name,
    month(date_value) AS month,
    date_format(date_value, 'MMMM') AS month_name,
    weekofyear(date_value) AS week_of_year,
    dayofmonth(date_value) AS day_of_month,
    dayofweek(date_value) AS day_of_week,
    date_format(date_value, 'EEEE') AS day_name,
    dayofweek(date_value) IN (1, 7) AS is_weekend,
    false AS is_holiday,
    year(date_value) AS fiscal_year,
    quarter(date_value) AS fiscal_quarter,
    current_timestamp() AS row_insert_timestamp,
    current_timestamp() AS row_update_timestamp
FROM calendar;


-- =====================================================
-- DIM_CUSTOMER
-- =====================================================
-- One row per customer_id, enriched with a single latitude/longitude taken
-- from sl_geolocation rows that match the customer's zip prefix, city and
-- state. Customers with no matching geolocation row are kept with NULL
-- coordinates (LEFT JOIN).
--
-- Several geolocation rows can match one customer, so exactly one is kept via
-- ROW_NUMBER(). The ordering is a total order so that rebuilding the table
-- always keeps the same row:
--   * lowest latitude first, then lowest longitude;
--   * NULLS LAST, because Spark sorts NULLs first in ascending order by
--     default and a NULL coordinate should never win over a real one;
--   * the customer attributes break any remaining ties if customer_base holds
--     more than one row for the same customer_id.
--
-- NOTE(review): customer_key is re-derived from the customer_id ordering on
-- every CREATE OR REPLACE, so a key value is only stable while the set of
-- customer_ids is unchanged.

CREATE OR REPLACE TABLE dim_customer
USING DELTA
AS
WITH customer_base AS (
    SELECT DISTINCT
        customer_id,
        customer_unique_id,
        customer_zip_code_prefix,
        customer_city,
        customer_state
    FROM silver_shortcut.sl_customers
),
customer_geo AS (
    SELECT
        c.customer_id,
        c.customer_unique_id,
        c.customer_zip_code_prefix,
        c.customer_city,
        c.customer_state,
        g.geolocation_lat,
        g.geolocation_lng,
        ROW_NUMBER() OVER (
            PARTITION BY c.customer_id
            ORDER BY
                g.geolocation_lat NULLS LAST,
                g.geolocation_lng NULLS LAST,
                c.customer_unique_id,
                c.customer_zip_code_prefix,
                c.customer_city,
                c.customer_state
        ) AS rn
    FROM customer_base AS c
    LEFT JOIN silver_shortcut.sl_geolocation AS g
        ON c.customer_zip_code_prefix = g.geolocation_zip_code_prefix
        AND c.customer_city = g.geolocation_city
        AND c.customer_state = g.geolocation_state
)
SELECT
    row_number() OVER (ORDER BY customer_id) AS customer_key,
    customer_id,
    customer_unique_id,
    customer_zip_code_prefix,
    customer_city,
    customer_state,
    geolocation_lat AS customer_geolocation_lat,
    geolocation_lng AS customer_geolocation_lng,
    current_timestamp() AS row_insert_timestamp,
    current_timestamp() AS row_update_timestamp
FROM customer_geo
WHERE rn = 1;


-- =====================================================
-- DIM_PRODUCT
-- =====================================================
-- Product dimension built from the distinct product rows in sl_products,
-- with the category name translated to English where a translation exists
-- (otherwise the original category name is used as the fallback).
--
-- The source columns product_name_lenght / product_description_lenght are
-- exposed under the corrected spellings *_length.
-- product_volume_cm3 is length * height * width; it is NULL whenever any of
-- the three dimensions is NULL, because NULL propagates through the product.

CREATE OR REPLACE TABLE dim_product
USING DELTA
AS
WITH product_source AS (
    SELECT DISTINCT
        product_id,
        product_category_name,
        product_name_lenght,
        product_description_lenght,
        product_photos_qty,
        product_weight_g,
        product_length_cm,
        product_height_cm,
        product_width_cm
    FROM silver_shortcut.sl_products
)
SELECT
    row_number() OVER (ORDER BY src.product_id) AS product_key,
    src.product_id,
    src.product_category_name,
    coalesce(tr.product_category_name_english, src.product_category_name) AS product_category_name_english,
    src.product_name_lenght AS product_name_length,
    src.product_description_lenght AS product_description_length,
    src.product_photos_qty,
    src.product_weight_g,
    src.product_length_cm,
    src.product_height_cm,
    src.product_width_cm,
    src.product_length_cm * src.product_height_cm * src.product_width_cm AS product_volume_cm3,
    current_timestamp() AS row_insert_timestamp,
    current_timestamp() AS row_update_timestamp
FROM product_source AS src
LEFT JOIN silver_shortcut.sl_product_category_translation AS tr
    ON src.product_category_name = tr.product_category_name;


-- =====================================================
-- DIM_SELLER
-- =====================================================
-- One row per seller_id, enriched with a zip prefix and latitude/longitude
-- taken from a single sl_geolocation row in the seller's city and state.
-- Sellers with no matching geolocation row are kept with NULL geo columns
-- (LEFT JOIN).
--
-- Every geolocation row in the same city/state is a candidate, so exactly one
-- is kept via ROW_NUMBER(). The ordering is a total order so that rebuilding
-- the table always keeps the same row:
--   * lowest latitude first, then lowest longitude, then lowest zip prefix;
--   * NULLS LAST, because Spark sorts NULLs first in ascending order by
--     default and a NULL coordinate should never win over a real one;
--   * seller city/state break any remaining ties if seller_base holds more
--     than one row for the same seller_id.
--
-- NOTE(review): seller_zip_code_prefix is the zip prefix of the selected
-- geolocation row, not a value read from the seller record. If sl_sellers
-- exposes the seller's own zip prefix, joining on it (as dim_customer does
-- for customers) would be more precise - confirm against the Silver schema.
--
-- NOTE(review): seller_key is re-derived from the seller_id ordering on every
-- CREATE OR REPLACE, so a key value is only stable while the set of
-- seller_ids is unchanged.

CREATE OR REPLACE TABLE dim_seller
USING DELTA
AS
WITH seller_base AS (
    SELECT DISTINCT
        seller_id,
        seller_city,
        seller_state
    FROM silver_shortcut.sl_sellers
),
seller_geo AS (
    SELECT
        s.seller_id,
        s.seller_city,
        s.seller_state,
        g.geolocation_zip_code_prefix,
        g.geolocation_lat,
        g.geolocation_lng,
        ROW_NUMBER() OVER (
            PARTITION BY s.seller_id
            ORDER BY
                g.geolocation_lat NULLS LAST,
                g.geolocation_lng NULLS LAST,
                g.geolocation_zip_code_prefix NULLS LAST,
                s.seller_city,
                s.seller_state
        ) AS rn
    FROM seller_base AS s
    LEFT JOIN silver_shortcut.sl_geolocation AS g
        ON s.seller_city = g.geolocation_city
        AND s.seller_state = g.geolocation_state
)
SELECT
    row_number() OVER (ORDER BY seller_id) AS seller_key,
    seller_id,
    seller_city,
    seller_state,
    geolocation_zip_code_prefix AS seller_zip_code_prefix,
    geolocation_lat AS seller_geolocation_lat,
    geolocation_lng AS seller_geolocation_lng,
    current_timestamp() AS row_insert_timestamp,
    current_timestamp() AS row_update_timestamp
FROM seller_geo
WHERE rn = 1;


-- =====================================================
-- DIM_ORDER
-- =====================================================
-- Order dimension built from the distinct order rows in sl_orders, carrying
-- the order status, its lifecycle timestamps and three derived day counts:
--   * days_to_approve           : purchase  -> approval
--   * days_to_deliver           : purchase  -> delivery to customer
--   * delivery_vs_estimate_days : estimated -> actual delivery
--                                 (positive = delivered after the estimate)
-- Each derived value is NULL when either of its two inputs is NULL, because
-- datediff() returns NULL for a NULL argument.

CREATE OR REPLACE TABLE dim_order
USING DELTA
AS
WITH order_source AS (
    SELECT DISTINCT
        order_id,
        order_status,
        order_purchase_timestamp,
        order_approved_at,
        order_delivered_carrier_date,
        order_delivered_customer_date,
        order_estimated_delivery_date
    FROM silver_shortcut.sl_orders
)
SELECT
    row_number() OVER (ORDER BY src.order_id) AS order_key,
    src.order_id,
    src.order_status,
    src.order_purchase_timestamp,
    src.order_approved_at,
    src.order_delivered_carrier_date,
    src.order_delivered_customer_date,
    src.order_estimated_delivery_date,
    datediff(src.order_approved_at, src.order_purchase_timestamp) AS days_to_approve,
    datediff(src.order_delivered_customer_date, src.order_purchase_timestamp) AS days_to_deliver,
    datediff(src.order_delivered_customer_date, src.order_estimated_delivery_date) AS delivery_vs_estimate_days,
    current_timestamp() AS row_insert_timestamp,
    current_timestamp() AS row_update_timestamp
FROM order_source AS src;


-- =====================================================
-- DIM_PAYMENT
-- =====================================================
-- Payment dimension: one row per distinct (payment_type,
-- payment_installments) combination found in sl_payments.
-- Rows with a NULL payment_type are excluded; payment_installments is not
-- filtered, so a combination with NULL installments is kept if present.
-- payment_key is numbered in (payment_type, payment_installments) order.

CREATE OR REPLACE TABLE dim_payment
USING DELTA
AS
WITH payment_combos AS (
    SELECT DISTINCT
        payment_type,
        payment_installments
    FROM silver_shortcut.sl_payments
    WHERE payment_type IS NOT NULL
)
SELECT
    row_number() OVER (ORDER BY combo.payment_type, combo.payment_installments) AS payment_key,
    combo.payment_type,
    combo.payment_installments,
    current_timestamp() AS row_insert_timestamp,
    current_timestamp() AS row_update_timestamp
FROM payment_combos AS combo
ORDER BY combo.payment_type, combo.payment_installments;


-- =====================================================
-- VERIFICATION
-- =====================================================
-- Uniqueness check: one result row per dimension with
--   dimension   : table name
--   total_rows  : COUNT(*)
--   unique_keys : distinct surrogate keys
--   status      : '✅ UNIQUE' when both the surrogate key and the business
--                 key are unique across all rows, otherwise '❌ DUPLICATES'
--
-- unique_business_keys is an internal helper column (not returned). For
-- dim_date the surrogate key is the only key, so it is counted twice.
--
-- dim_payment's business key is (payment_type, payment_installments).
-- payment_installments may be NULL, and concat() returns NULL if any argument
-- is NULL, which COUNT(DISTINCT ...) would then skip - reporting duplicates
-- that do not exist. The coalesce() maps NULL installments to a sentinel so
-- such a row is still counted.

WITH key_counts AS (
    SELECT
        'dim_date' AS dimension,
        COUNT(*) AS total_rows,
        COUNT(DISTINCT date_key) AS unique_keys,
        COUNT(DISTINCT date_key) AS unique_business_keys
    FROM dim_date

    UNION ALL

    SELECT
        'dim_customer',
        COUNT(*),
        COUNT(DISTINCT customer_key),
        COUNT(DISTINCT customer_id)
    FROM dim_customer

    UNION ALL

    SELECT
        'dim_product',
        COUNT(*),
        COUNT(DISTINCT product_key),
        COUNT(DISTINCT product_id)
    FROM dim_product

    UNION ALL

    SELECT
        'dim_seller',
        COUNT(*),
        COUNT(DISTINCT seller_key),
        COUNT(DISTINCT seller_id)
    FROM dim_seller

    UNION ALL

    SELECT
        'dim_order',
        COUNT(*),
        COUNT(DISTINCT order_key),
        COUNT(DISTINCT order_id)
    FROM dim_order

    UNION ALL

    SELECT
        'dim_payment',
        COUNT(*),
        COUNT(DISTINCT payment_key),
        COUNT(DISTINCT concat(
            payment_type,
            '-',
            coalesce(CAST(payment_installments AS STRING), '<null>')
        ))
    FROM dim_payment
)
SELECT
    dimension,
    total_rows,
    unique_keys,
    CASE
        WHEN total_rows = unique_keys
            AND total_rows = unique_business_keys
        THEN '✅ UNIQUE'
        ELSE '❌ DUPLICATES'
    END AS status
FROM key_counts;

StatementMeta(, 5e2c129c-7271-4284-9c8a-35e64fac8c4e, 8, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

<Spark SQL result set with 0 rows and 0 fields>

<Spark SQL result set with 0 rows and 0 fields>

<Spark SQL result set with 0 rows and 0 fields>

<Spark SQL result set with 0 rows and 0 fields>

<Spark SQL result set with 0 rows and 0 fields>

<Spark SQL result set with 6 rows and 4 fields>