In [0]:
%sql
-- Start the gold layer from an empty schema: CASCADE drops the schema together
-- with every object it contains, then the schema is created again.
DROP SCHEMA IF EXISTS etl.gold CASCADE;
CREATE SCHEMA etl.gold

In [0]:
from pyspark.sql import Window
from pyspark.sql.functions import coalesce, col, lit, row_number, when

# Build etl.gold.dim_customers from the CRM customer table, left-joined with the
# two ERP customer tables, and publish it as a Delta table.
spark.sql("DROP TABLE IF EXISTS etl.gold.dim_customers")

df_crm_cust_info = spark.read.table("etl.silver.crm_cust_info")
df_erp_cust_az12 = spark.read.table("etl.silver.erp_cust_az12")
df_erp_loc_a101 = spark.read.table("etl.silver.erp_loc_a101")

# Both ERP tables are matched on their own `cid` column, so the join conditions
# are written against the source DataFrames to keep the two `cid`s unambiguous.
# Left joins keep every CRM customer even when an ERP record is missing.
customers_enriched = (
    df_crm_cust_info
    .join(df_erp_cust_az12, df_crm_cust_info.cst_key == df_erp_cust_az12.cid, "left")
    .join(df_erp_loc_a101, df_crm_cust_info.cst_key == df_erp_loc_a101.cid, "left")
)

# Prefer the CRM gender; fall back to the ERP value when the CRM one is "n/a"
# (or NULL, since the comparison is then not true). If the ERP value is missing
# as well -- e.g. no ERP row matched the left join -- emit "n/a" rather than NULL
# so the column uses a single placeholder.
# NOTE(review): assumes "n/a" is the silver layer's "unknown" marker -- confirm.
gender = (
    when(col("cst_gndr") != "n/a", col("cst_gndr"))
    .otherwise(coalesce(col("gen"), lit("n/a")))
)

# customer_key is a surrogate key numbered in cst_id order. The window has no
# partitioning, so the numbering runs over the whole data set.
df = customers_enriched.select(
    row_number().over(Window.orderBy("cst_id")).alias("customer_key"),
    col("cst_id").alias("customer_id"),
    col("cst_key").alias("customer_number"),
    col("cst_firstname").alias("first_name"),
    col("cst_lastname").alias("last_name"),
    col("cntry").alias("country"),
    col("cst_marital_status").alias("marital_status"),
    gender.alias("gender"),
    col("bdate").alias("birthdate"),
    col("cst_create_date").alias("create_date"),
)

df.write.mode("overwrite").format("delta").saveAsTable("etl.gold.dim_customers")

In [0]:
from pyspark.sql.functions import filter

# Build etl.gold.dim_products from the CRM product table, left-joined with the ERP
# category table, and publish it as a Delta table.
spark.sql("DROP TABLE IF EXISTS etl.gold.dim_products")

df_crm_prd_info = spark.read.table("etl.silver.crm_prd_info")
df_erp_px_cat_g1v2 = spark.read.table("etl.silver.erp_px_cat_g1v2")

# Keep only rows whose prd_end_dt is NULL (presumably the currently valid product
# versions -- confirm against the silver layer).
# The filter must run BEFORE the surrogate key is assigned: DataFrame operations
# apply in call order, so numbering first and filtering afterwards would leave
# gaps in product_key and make it depend on rows that are then discarded.
current_products = (
    df_crm_prd_info
    .join(df_erp_px_cat_g1v2, df_crm_prd_info.cat_id == df_erp_px_cat_g1v2.id, "left")
    .filter(col("prd_end_dt").isNull())
)

# product_key is a surrogate key numbered by (prd_start_dt, prd_key) over the
# remaining rows; the other columns are renamed to their gold-layer names.
df = current_products.select(
    row_number().over(Window.orderBy("prd_start_dt", "prd_key")).alias("product_key"),
    col("prd_id").alias("product_id"),
    col("prd_key").alias("product_number"),
    col("prd_nm").alias("product_name"),
    col("cat_id").alias("category_id"),
    col("cat").alias("category"),
    col("subcat").alias("subcategory"),
    col("maintenance"),
    col("prd_cost").alias("cost"),
    col("prd_line").alias("product_line"),
    col("prd_start_dt").alias("start_date"),
)

df.write.mode("overwrite").format("delta").saveAsTable("etl.gold.dim_products")

In [0]:
# Build etl.gold.fact_sales: every silver sales line, with the surrogate keys
# looked up from the two gold dimensions, published as a Delta table.
spark.sql("DROP TABLE IF EXISTS etl.gold.fact_sales")

df_crm_sales_details = spark.read.table("etl.silver.crm_sales_details")
df_dim_products = spark.read.table("etl.gold.dim_products")
df_dim_customers = spark.read.table("etl.gold.dim_customers")

# Lookup conditions are expressed against the source DataFrames. Left joins keep
# a sales line even when no dimension row matches (its key is then NULL).
product_match = df_crm_sales_details.sls_prd_key == df_dim_products.product_number
customer_match = df_crm_sales_details.sls_cust_id == df_dim_customers.customer_id

df = (
    df_crm_sales_details
    .join(df_dim_products, product_match, "left")
    .join(df_dim_customers, customer_match, "left")
)

# Silver column name -> gold column name. product_key and customer_key come from
# the dimensions already carrying their final names, so they need no entry here.
sales_renames = {
    "sls_ord_num": "order_number",
    "sls_order_dt": "order_date",
    "sls_ship_dt": "shipping_date",
    "sls_due_dt": "due_date",
    "sls_sales": "sales_amount",
    "sls_quantity": "quantity",
    "sls_price": "price",
}
for silver_name, gold_name in sales_renames.items():
    df = df.withColumnRenamed(silver_name, gold_name)

df = df.select(
    "order_number",
    "product_key",
    "customer_key",
    "order_date",
    "shipping_date",
    "due_date",
    "sales_amount",
    "quantity",
    "price",
)

df.write.mode("overwrite").format("delta").saveAsTable("etl.gold.fact_sales")