In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window as W
from pyspark.sql.types import *
from pyspark.sql import SparkSession

In [0]:
%python
# List the contents of the silver directory; the entries shown in the cell
# output are the table folders that the read cell below loads from.
dbutils.fs.ls("/mnt/salesData/silver")

[FileInfo(path='dbfs:/mnt/salesData/silver/crm_cust_info/', name='crm_cust_info/', size=0, modificationTime=1749581400000),
 FileInfo(path='dbfs:/mnt/salesData/silver/crm_prd_info/', name='crm_prd_info/', size=0, modificationTime=1749581402000),
 FileInfo(path='dbfs:/mnt/salesData/silver/crm_sales_detail_info/', name='crm_sales_detail_info/', size=0, modificationTime=1749581396000),
 FileInfo(path='dbfs:/mnt/salesData/silver/erp_az/', name='erp_az/', size=0, modificationTime=1749581404000),
 FileInfo(path='dbfs:/mnt/salesData/silver/erp_loc/', name='erp_loc/', size=0, modificationTime=1749581403000),
 FileInfo(path='dbfs:/mnt/salesData/silver/erp_px/', name='erp_px/', size=0, modificationTime=1749581405000)]

In [0]:
# Load the six silver-layer tables from DBFS as DataFrames.
# Every source is read as CSV with a header row, and column types are inferred
# by Spark instead of being declared in this notebook.
csv_read_options = {"header": True, "inferSchema": True}

# CRM sources
crm_cust_info = spark.read.csv("dbfs:/mnt/salesData/silver/crm_cust_info/", **csv_read_options)
crm_prd_info = spark.read.csv("dbfs:/mnt/salesData/silver/crm_prd_info", **csv_read_options)
crm_sales = spark.read.csv("dbfs:/mnt/salesData/silver/crm_sales_detail_info", **csv_read_options)

# ERP sources
erp_az = spark.read.csv("dbfs:/mnt/salesData/silver/erp_az", **csv_read_options)
erp_loc = spark.read.csv("dbfs:/mnt/salesData/silver/erp_loc", **csv_read_options)
erp_px = spark.read.csv("dbfs:/mnt/salesData/silver/erp_px", **csv_read_options)

In [0]:
# Expose each silver DataFrame to Spark SQL under a temporary view name.
# Nothing is joined in this cell: the views are only registered here, and the
# spark.sql queries in the following cells build the dimension and fact tables.
silver_views = {
    # product sources, combined later into dim_product
    "view_prd_info": crm_prd_info,
    "view_px": erp_px,
    # customer sources, combined later into dim_customer
    "view_cust_info": crm_cust_info,
    "view_az": erp_az,
    "view_loc": erp_loc,
    # sales source, renamed later into fact_sales
    "view_sales": crm_sales,
}

for view_name, silver_df in silver_views.items():
    silver_df.createOrReplaceTempView(view_name)

In [0]:
# Customer dimension: CRM customer columns under business-friendly names,
# enriched with birthdate (view_az) and country (view_loc).
# Both ERP views are LEFT JOINed on cst_key = CID, so a CRM customer without an
# ERP match is kept, with NULL in the ERP-sourced columns.
# Gender: the CRM value is used unless it is the literal 'N/A'; in that case the
# ERP GEN value is used, falling back to 'N/A' when GEN is NULL.
# NOTE(review): a NULL cst_gndr does not match = 'N/A', so it falls through to
# ELSE and stays NULL — confirm the silver layer never leaves cst_gndr NULL.
# NOTE(review): assumes CID is unique in view_az and view_loc; duplicates there
# would multiply customer rows — verify against the silver data.
dim_customer =spark.sql("""
        SELECT
            cf.cst_id AS customer_id,
            cf.cst_key AS customer_key,
            cf.cst_firstname AS firstname,
            cf.cst_lastname AS lastname,
            CASE 
                WHEN cf.cst_gndr = 'N/A' THEN coalesce(ca.GEN,'N/A')
                ELSE cf.cst_gndr
            END gender,
            cf.cst_marital_status AS marital_status,
            la.CNTRY AS country,
            ca.BDATE AS birthdate,
            cf.cst_create_date AS customer_created_date
        FROM view_cust_info cf 
        LEFT JOIN view_az ca 
        ON cf.cst_key = ca.CID 
        LEFT JOIN view_loc la 
        ON cf.cst_key = la.CID 
    """
    
)

In [0]:
# Product dimension: CRM product columns under business-friendly names,
# enriched with category, subcategory and maintenance columns from view_px.
# view_px is LEFT JOINed on prd_cat_key = id, so a product without a matching
# category row is kept, with NULL in the category columns.
# NOTE(review): the WHERE clause keeps only rows that HAVE an end date. If a
# NULL prd_end_dt marks the current version of a product, this filter drops
# every current product and keeps only superseded ones — confirm whether
# `IS NULL` was intended.
dim_product =spark.sql("""
    SELECT 
        pf.prd_id AS product_id,
        pf.prd_key AS product_key,
        pf.prd_cat_key AS product_category_key,
        pf.prd_nm AS product_name,
        pf.prd_line AS product_line,
        pf.prd_cost AS product_cost,
        pf.prd_start_dt AS product_start_date,
        pf.prd_end_dt AS product_end_date,
        pcg.cat AS product_category,
        pcg.maintenance,
        pcg.subcat AS product_subcategory
    FROM view_prd_info pf 
    LEFT JOIN view_px pcg 
        ON pf.prd_cat_key = pcg.id
    WHERE pf.prd_end_dt IS NOT NULL
""")


In [0]:
# Sales fact table: view_sales columns renamed to business-friendly names.
# SELECT DISTINCT collapses rows that are identical across all nine selected
# columns. No join to the dimension tables happens here; product_key and
# customer_id carry the source values of sls_prd_key and sls_cust_id.
# NOTE(review): presumably these line up with dim_product.product_key and
# dim_customer.customer_id — verify against the silver data.
fact_sales =spark.sql("""
    SELECT DISTINCT
        sls_ord_num AS order_number,
        sls_prd_key AS product_key,
        sls_cust_id AS customer_id,
        sls_order_dt AS order_date,
        sls_ship_dt AS ship_date,
        sls_due_dt AS due_date,
        sls_sales AS sales_amount,
        sls_quantity AS sales_quantity,
        sls_price AS sales_price
    FROM view_sales
""")

In [0]:
# write all transformed data as csv to the gold dir
def write_to_gold(df, table_name, gold_root="/mnt/salesData/gold", raise_on_error=False):
    """Write a DataFrame to the gold layer as CSV with a header row.

    Any previous output at the target location is overwritten. The outcome is
    reported with print(): a confirmation line on success, or an error line
    containing the exception on failure.

    Args:
        df: DataFrame to persist; only its ``.write`` interface is used.
        table_name: Name of the directory created under ``gold_root``.
        gold_root: Directory that holds the gold tables. Defaults to the
            location this notebook has always written to.
        raise_on_error: When True, the write failure is re-raised after it has
            been reported, so a notebook or job run stops instead of carrying
            on with a missing table. The default (False) keeps the original
            report-and-continue behaviour.

    Returns:
        None.

    Raises:
        Exception: whatever the write raised, only when ``raise_on_error`` is True.
    """
    # Tolerate a trailing slash on the root so the path never contains "//".
    target_path = f"{gold_root.rstrip('/')}/{table_name}"
    try:
        df.write.mode("overwrite").format("csv").option("header", True).save(target_path)
        print(f"{table_name} written to gold")
    except Exception as e:
        print(f"error in writing {table_name} to gold", e)
        if raise_on_error:
            raise
      

In [0]:
# Persist the three gold tables, each under a directory named after the table.
# write_to_gold prints one status line per table.
gold_tables = (
    (dim_customer, 'dim_customer'),
    (dim_product, 'dim_product'),
    (fact_sales, 'fact_sales'),
)

for gold_df, gold_name in gold_tables:
    write_to_gold(gold_df, gold_name)



dim_customer written to gold
dim_product written to gold
fact_sales written to gold
