In [1]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import *
# Switch Spark's datetime parser/formatter to the LEGACY policy, i.e. the
# pre-Spark-3.0 behaviour for parsing and formatting date/timestamp strings.
# NOTE(review): no explicit date-pattern parsing is visible in this notebook —
# confirm whether this session-wide setting is still required.
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

StatementMeta(, 3331cb6d-2ef4-49db-8fa5-dc51c9be930b, 3, Finished, Available)

## **Customer**

In [2]:
# Read every column of the Silver-layer customer table.
customer = spark.sql(
    "SELECT * FROM LTT_SilverLakehouse.customer"
)

# Preview the first five rows.
customer_preview = customer.head(5)
display(customer_preview)

StatementMeta(, 3331cb6d-2ef4-49db-8fa5-dc51c9be930b, 4, Finished, Available)

SynapseWidget(Synapse.DataFrame, 7bd8428b-0daf-4bd2-a857-4561ad24bc34)

In [3]:
# Spot check: list up to ten customers named "James Smith" to see how many
# distinct customer_ids share one first/last name combination.
is_james_smith = (F.col("customer_fname") == "James") & (F.col("customer_lname") == 'Smith')
customer.where(is_james_smith).show(10)

StatementMeta(, 3331cb6d-2ef4-49db-8fa5-dc51c9be930b, 5, Finished, Available)

+-----------+--------------+--------------+----------------+
|customer_id|customer_fname|customer_lname|customer_segment|
+-----------+--------------+--------------+----------------+
|        419|         James|         Smith|        Consumer|
|        467|         James|         Smith|        Consumer|
|        673|         James|         Smith|       Corporate|
|       1000|         James|         Smith|       Corporate|
|       1072|         James|         Smith|        Consumer|
|       1424|         James|         Smith|        Consumer|
|       1449|         James|         Smith|        Consumer|
|       1585|         James|         Smith|        Consumer|
|       2220|         James|         Smith|       Corporate|
|       2411|         James|         Smith|        Consumer|
+-----------+--------------+--------------+----------------+
only showing top 10 rows



In [4]:
# Stamp every customer row with validity-tracking columns:
#   valid_from - today's date (date only, no time component)
#   valid_to   - the fixed far-future date 9999-12-31, parsed to a DATE
#   is_valid   - True for every row
# No window/lag logic is applied: each row receives the same constant values,
# so the previously defined (and never used) window specification was removed.
customer = (
    customer
    .withColumn("valid_from", F.current_date())
    # Build the literal and convert it to a date in one step.
    .withColumn("valid_to", F.to_date(F.lit("9999-12-31")))
    .withColumn("is_valid", F.lit(True))
)


StatementMeta(, 3331cb6d-2ef4-49db-8fa5-dc51c9be930b, 6, Finished, Available)

In [5]:
# Preview five customer rows.
customer_preview = customer.head(5)
display(customer_preview)

StatementMeta(, 3331cb6d-2ef4-49db-8fa5-dc51c9be930b, 7, Finished, Available)

SynapseWidget(Synapse.DataFrame, 53f20a1f-b7c9-44ef-8e7a-75f40e66256c)

In [6]:
# Assign the surrogate key customer_key = 1..N following customer_id order.
#
# row_number() over an ordered window yields a dense, consecutive integer
# sequence. monotonically_increasing_id() only guarantees unique, increasing
# 64-bit values: it stores the partition id in the upper bits, so as soon as the
# frame has more than one partition the ids jump by billions and no longer fit
# in an int, which can produce duplicate or garbage keys after the cast.
#
# An un-partitioned window moves all rows to a single partition.
# NOTE(review): assumes the customer dimension is small enough for that and
# that customer_id is unique (ties would be numbered arbitrarily) - confirm.
customer_key_window = Window.orderBy("customer_id")

customer = (
    customer
    .withColumn("customer_key", F.row_number().over(customer_key_window).cast("int"))
    .select(
        'customer_key',
        'customer_id',
        'customer_fname',
        'customer_lname',
        'customer_segment',
        'valid_from',
        'valid_to',
        'is_valid',
    )
    .orderBy('customer_key')
)

StatementMeta(, 3331cb6d-2ef4-49db-8fa5-dc51c9be930b, 8, Finished, Available)

## **Product**


In [7]:
# Read every column of the Silver-layer product table.
product = spark.sql(
    "SELECT * FROM LTT_SilverLakehouse.product"
)

StatementMeta(, 3331cb6d-2ef4-49db-8fa5-dc51c9be930b, 9, Finished, Available)

In [8]:
# Stamp every product row with validity-tracking columns:
#   valid_from - today's date
#   valid_to   - the fixed far-future date 9999-12-31, parsed to a DATE
#   is_valid   - True for every row
# The window specification that used to be built here was never applied to any
# column, so it has been dropped.
product = (
    product
    .withColumn("valid_from", F.current_date())
    # Build the literal and convert it to a date in one step.
    .withColumn("valid_to", F.to_date(F.lit("9999-12-31")))
    .withColumn("is_valid", F.lit(True))
)



StatementMeta(, 3331cb6d-2ef4-49db-8fa5-dc51c9be930b, 10, Finished, Available)

In [9]:
# Assign the surrogate key product_key = 1..N following product_id order.
#
# row_number() over an ordered window yields a dense, consecutive integer
# sequence. monotonically_increasing_id() only guarantees unique, increasing
# 64-bit values (the partition id lives in the upper bits), so on a
# multi-partition frame the ids are not consecutive and overflow when cast to
# int.
#
# An un-partitioned window moves all rows to a single partition.
# NOTE(review): assumes the product dimension is small and product_id is
# unique (ties would be numbered arbitrarily) - confirm.
product_key_window = Window.orderBy("product_id")

product = (
    product
    .withColumn("product_key", F.row_number().over(product_key_window).cast("int"))
    .select(
        'product_key',
        'product_id',
        'product_category_id',
        'product_name',
        'product_price',
        'product_status',
        'valid_from',
        'valid_to',
        'is_valid',
    )
)

# Row-count sanity check for the product dimension.
print(product.count())

StatementMeta(, 3331cb6d-2ef4-49db-8fa5-dc51c9be930b, 11, Finished, Available)

118


In [10]:
# Preview five product rows.
product_preview = product.head(5)
display(product_preview)

StatementMeta(, 3331cb6d-2ef4-49db-8fa5-dc51c9be930b, 12, Finished, Available)

SynapseWidget(Synapse.DataFrame, 52923e38-310f-4374-9f6e-0ea5b2098635)

## **Product Category Table**

In [11]:
# Read every column of the Silver-layer category table.
category = spark.sql(
    "SELECT * FROM LTT_SilverLakehouse.category"
)

StatementMeta(, 3331cb6d-2ef4-49db-8fa5-dc51c9be930b, 13, Finished, Available)

In [12]:
# Stamp every category row with validity-tracking columns:
#   valid_from - today's date
#   valid_to   - the fixed far-future date 9999-12-31, parsed to a DATE
#   is_valid   - True for every row
# The window specification that used to be built here was never applied to any
# column, so it has been dropped.
category = (
    category
    .withColumn("valid_from", F.current_date())
    # Build the literal and convert it to a date in one step.
    .withColumn("valid_to", F.to_date(F.lit("9999-12-31")))
    .withColumn("is_valid", F.lit(True))
)

StatementMeta(, 3331cb6d-2ef4-49db-8fa5-dc51c9be930b, 14, Finished, Available)

In [13]:
# Assign the surrogate key category_key = 1..N following product_category_id
# order.
#
# row_number() over an ordered window yields a dense, consecutive integer
# sequence. monotonically_increasing_id() only guarantees unique, increasing
# 64-bit values (the partition id lives in the upper bits), so on a
# multi-partition frame the ids are not consecutive and overflow when cast to
# int.
#
# An un-partitioned window moves all rows to a single partition.
# NOTE(review): assumes the category dimension is small and
# product_category_id is unique (ties would be numbered arbitrarily) - confirm.
category_key_window = Window.orderBy("product_category_id")

category = (
    category
    .withColumn("category_key", F.row_number().over(category_key_window).cast("int"))
    .select(
        "category_key",
        'product_category_id',
        'category_name',
        'department_id',
        'valid_from',
        'valid_to',
        'is_valid',
    )
)

StatementMeta(, 3331cb6d-2ef4-49db-8fa5-dc51c9be930b, 15, Finished, Available)

## **Department**

In [14]:
# Read every column of the Silver-layer department table.
department_table = spark.sql(
    "SELECT * FROM LTT_SilverLakehouse.department"
)

StatementMeta(, 3331cb6d-2ef4-49db-8fa5-dc51c9be930b, 16, Finished, Available)

In [15]:
# Stamp every department row with validity-tracking columns:
#   valid_from - today's date
#   valid_to   - the fixed far-future date 9999-12-31, parsed to a DATE
#   is_valid   - True for every row
# The window specification that used to be built here was never applied to any
# column, so it has been dropped.
department_table = (
    department_table
    .withColumn('valid_from', F.current_date())
    # Build the literal and convert it to a date in one step.
    .withColumn('valid_to', F.to_date(F.lit("9999-12-31")))
    .withColumn('is_valid', F.lit(True))
)

StatementMeta(, 3331cb6d-2ef4-49db-8fa5-dc51c9be930b, 17, Finished, Available)

In [16]:
# Assign the surrogate key department_key = 1..N following department_id order.
#
# row_number() over an ordered window yields a dense, consecutive integer
# sequence. monotonically_increasing_id() only guarantees unique, increasing
# 64-bit values (the partition id lives in the upper bits), so on a
# multi-partition frame the ids are not consecutive and overflow when cast to
# int.
#
# An un-partitioned window moves all rows to a single partition.
# NOTE(review): assumes the department dimension is small and department_id is
# unique (ties would be numbered arbitrarily) - confirm.
department_key_window = Window.orderBy("department_id")

department_table = (
    department_table
    .withColumn("department_key", F.row_number().over(department_key_window).cast("int"))
    .select(
        'department_key',
        'department_id',
        'department_name',
        'valid_from',
        'valid_to',
        'is_valid',
    )
)

StatementMeta(, 3331cb6d-2ef4-49db-8fa5-dc51c9be930b, 18, Finished, Available)

In [21]:
# Print the department dimension (show()'s defaults spelled out: 20 rows,
# long cell values truncated).
department_table.show(n=20, truncate=True)

StatementMeta(, 3331cb6d-2ef4-49db-8fa5-dc51c9be930b, 23, Finished, Available)

+--------------+-------------+------------------+----------+----------+--------+
|department_key|department_id|   department_name|valid_from|  valid_to|is_valid|
+--------------+-------------+------------------+----------+----------+--------+
|             1|            2|           Fitness|2024-06-18|9999-12-31|    true|
|             2|            3|          Footwear|2024-06-18|9999-12-31|    true|
|             3|            4|           Apparel|2024-06-18|9999-12-31|    true|
|             4|            5|              Golf|2024-06-18|9999-12-31|    true|
|             5|            6|          Outdoors|2024-06-18|9999-12-31|    true|
|             6|            7|          Fan Shop|2024-06-18|9999-12-31|    true|
|             7|            8|         Book Shop|2024-06-18|9999-12-31|    true|
|             8|            9|        Discs Shop|2024-06-18|9999-12-31|    true|
|             9|           10|        Technology|2024-06-18|9999-12-31|    true|
|            10|           1

In [25]:
# Derive a short reporting label: the four departments listed below keep their
# own name; every other department is labelled "Others".
kept_departments = ["Fan Shop", "Golf", "Apparel", "Footwear"]

department_label = (
    F.when(F.col("department_name").isin(kept_departments), F.col("department_name"))
    .otherwise("Others")
)

department_table = department_table.withColumn("department_name_abb", department_label)

StatementMeta(, 3331cb6d-2ef4-49db-8fa5-dc51c9be930b, 27, Finished, Available)

In [26]:
# Print the department dimension including the new department_name_abb column
# (show()'s defaults spelled out: 20 rows, long cell values truncated).
department_table.show(n=20, truncate=True)

StatementMeta(, 3331cb6d-2ef4-49db-8fa5-dc51c9be930b, 28, Finished, Available)

+--------------+-------------+------------------+----------+----------+--------+-------------------+
|department_key|department_id|   department_name|valid_from|  valid_to|is_valid|department_name_abb|
+--------------+-------------+------------------+----------+----------+--------+-------------------+
|             1|            2|           Fitness|2024-06-18|9999-12-31|    true|             Others|
|             2|            3|          Footwear|2024-06-18|9999-12-31|    true|           Footwear|
|             3|            4|           Apparel|2024-06-18|9999-12-31|    true|            Apparel|
|             4|            5|              Golf|2024-06-18|9999-12-31|    true|               Golf|
|             5|            6|          Outdoors|2024-06-18|9999-12-31|    true|             Others|
|             6|            7|          Fan Shop|2024-06-18|9999-12-31|    true|           Fan Shop|
|             7|            8|         Book Shop|2024-06-18|9999-12-31|    true|           

## **Load to Table**

In [None]:
# Persist each dimension as a Delta table in the Gold lakehouse, replacing any
# existing contents. Tables are written in the order listed.
gold_dimensions = [
    (customer, 'LTT_GoldLakehouse.dim_customer'),
    (product, 'LTT_GoldLakehouse.dim_product'),
    (category, 'LTT_GoldLakehouse.dim_category'),
    (department_table, 'LTT_GoldLakehouse.dim_department'),
]

for dimension_df, target_table in gold_dimensions:
    dimension_df.write.format('delta').mode('overwrite').saveAsTable(target_table)

StatementMeta(, , , Cancelled, )