In [1]:
from pyspark.sql.functions import monotonically_increasing_id, when, col

StatementMeta(, 517a08fb-b3f7-4986-a302-cde60bf609b7, 3, Finished, Available)

## **Dataframe**

In [2]:
category = spark.sql("SELECT * FROM LTT_SilverLakehouse.category")
customer_region = spark.sql("SELECT * FROM LTT_SilverLakehouse.customer_region")
delivery_risk = spark.sql("SELECT * FROM LTT_SilverLakehouse.delivery_risk")
delivery_status = spark.sql("SELECT * FROM LTT_SilverLakehouse.delivery_status")
department = spark.sql("SELECT * FROM LTT_SilverLakehouse.department")
desti_order = spark.sql("SELECT * FROM LTT_SilverLakehouse.desti_order")
order_status = spark.sql("SELECT * FROM LTT_SilverLakehouse.order_status")
product = spark.sql("SELECT * FROM LTT_SilverLakehouse.product")
shipping_mode = spark.sql("SELECT * FROM LTT_SilverLakehouse.shipping_mode")
transaction_type = spark.sql("SELECT * FROM LTT_SilverLakehouse.transaction_type")
orders = spark.sql("SELECT * FROM LTT_SilverLakehouse.orders")

StatementMeta(, 517a08fb-b3f7-4986-a302-cde60bf609b7, 4, Finished, Available)

In [3]:
# df = spark.sql("SELECT * FROM LTT_BronzeLakehouse.cleaned_df")
# display(df.head(10))

StatementMeta(, 517a08fb-b3f7-4986-a302-cde60bf609b7, 5, Finished, Available)

## **Mapping Tables**

In [4]:
def create_mapping_dataframe(df, column_name, column_name_id):
    unique_values = df.select(column_name).distinct()
    mapping_df = unique_values.withColumn(column_name_id, monotonically_increasing_id() + 1)
    return mapping_df.select(column_name_id, column_name)

StatementMeta(, 517a08fb-b3f7-4986-a302-cde60bf609b7, 6, Finished, Available)

In [5]:
order_status = create_mapping_dataframe(order_status, 'order_status', 'order_status_key')
shipping_mode = create_mapping_dataframe(shipping_mode, 'shipping_mode', 'shipping_mode_key')
delivery_status = create_mapping_dataframe(delivery_status, 'delivery_status', 'delivery_status_key')
delivery_risk = create_mapping_dataframe(delivery_risk, 'late_delivery_risk', 'delivery_risk_key')
transaction_type = create_mapping_dataframe(transaction_type, 'transaction_type', 'transaction_type_key')
delivery_risk = delivery_risk.withColumn(
    "delivery_risk_name",
    when(delivery_risk["late_delivery_risk"] == 1, "is late").otherwise("not late")
)
transaction_type.show()
delivery_status.show()
delivery_risk.show(5)
shipping_mode.show()
order_status.show()

StatementMeta(, 517a08fb-b3f7-4986-a302-cde60bf609b7, 7, Finished, Available)

+--------------------+----------------+
|transaction_type_key|transaction_type|
+--------------------+----------------+
|                   1|            CASH|
|                   2|        TRANSFER|
|                   3|         PAYMENT|
|                   4|           DEBIT|
+--------------------+----------------+

+-------------------+-----------------+
|delivery_status_key|  delivery_status|
+-------------------+-----------------+
|                  1| Shipping on time|
|                  2| Advance shipping|
|                  3|Shipping canceled|
|                  4|    Late delivery|
+-------------------+-----------------+

+-----------------+------------------+------------------+
|delivery_risk_key|late_delivery_risk|delivery_risk_name|
+-----------------+------------------+------------------+
|                1|                 1|           is late|
|                2|                 0|          not late|
+-----------------+------------------+------------------+

+--------

In [6]:
shipping_mode = shipping_mode.withColumn("shipping_mode_key", col("shipping_mode_key").cast("integer"))
delivery_status = delivery_status.withColumn("delivery_status_key", col("delivery_status_key").cast("integer")) 
transaction_type = transaction_type.withColumn("transaction_type_key", col("transaction_type_key").cast("integer")) 
order_status = order_status.withColumn("order_status_key", col("order_status_key").cast("integer"))
delivery_risk = delivery_risk.withColumn("delivery_risk_key", col("delivery_risk_key").cast("integer"))

StatementMeta(, 517a08fb-b3f7-4986-a302-cde60bf609b7, 8, Finished, Available)

In [7]:
# from pyspark.sql.functions import col

# # Assuming your DataFrame is named orders
# orders = orders.withColumn("shipping_mode_id", col("shipping_mode_id").cast("integer")) \
#     .withColumn("delivery_status_id", col("delivery_status_id").cast("integer")) \
#     .withColumn("transaction_type_id", col("transaction_type_id").cast("integer")) \
#     .withColumn("order_status_id", col("order_status_id").cast("integer")) \
#     .withColumn("delivery_risk_id", col("delivery_risk_id").cast("integer"))

# # Printing the schema to verify the changes
# orders.printSchema()


StatementMeta(, 517a08fb-b3f7-4986-a302-cde60bf609b7, 9, Finished, Available)

In [8]:
delivery_risk = delivery_risk.drop('late_delivery_risk')

StatementMeta(, 517a08fb-b3f7-4986-a302-cde60bf609b7, 10, Finished, Available)

In [9]:
transaction_type.show()
delivery_status.show()
delivery_risk.show(5)
shipping_mode.show()
order_status.show()

StatementMeta(, 517a08fb-b3f7-4986-a302-cde60bf609b7, 11, Finished, Available)

+--------------------+----------------+
|transaction_type_key|transaction_type|
+--------------------+----------------+
|                   1|            CASH|
|                   2|        TRANSFER|
|                   3|         PAYMENT|
|                   4|           DEBIT|
+--------------------+----------------+

+-------------------+-----------------+
|delivery_status_key|  delivery_status|
+-------------------+-----------------+
|                  1| Shipping on time|
|                  2| Advance shipping|
|                  3|Shipping canceled|
|                  4|    Late delivery|
+-------------------+-----------------+

+-----------------+------------------+
|delivery_risk_key|delivery_risk_name|
+-----------------+------------------+
|                1|           is late|
|                2|          not late|
+-----------------+------------------+

+-----------------+--------------+
|shipping_mode_key| shipping_mode|
+-----------------+--------------+
|                1

> Do abbreviate

In [10]:
abbreviated_status = (
    when(delivery_status.delivery_status == "Shipping on time", "On time")
    .when(delivery_status.delivery_status == "Advance shipping", "Advance")
    .when(delivery_status.delivery_status == "Shipping canceled", "Canceled")
    .when(delivery_status.delivery_status == "Late delivery", "Late")
    .otherwise(delivery_status.delivery_status)
)
delivery_status = delivery_status.withColumn("abbreviated_status", abbreviated_status)

StatementMeta(, 517a08fb-b3f7-4986-a302-cde60bf609b7, 12, Finished, Available)

In [11]:
display(delivery_status)

StatementMeta(, 517a08fb-b3f7-4986-a302-cde60bf609b7, 13, Finished, Available)

SynapseWidget(Synapse.DataFrame, 1feaf96b-72dc-44d2-9ef8-64dd65124670)

## **Load into Gold Lakehouse**

In [13]:
# delivery_status.write.format('delta').mode('overwrite').option("overwriteSchema", "true").saveAsTable('LTT_GoldLakehouse.dim_delivery_status')

StatementMeta(, 517a08fb-b3f7-4986-a302-cde60bf609b7, 15, Finished, Available)

In [None]:
delivery_risk.write.format('delta').mode('overwrite').saveAsTable('LTT_GoldLakehouse.dim_delivery_risk')
delivery_status.write.format('delta').mode('overwrite').saveAsTable('LTT_GoldLakehouse.dim_delivery_status')
transaction_type.write.format('delta').mode('overwrite').saveAsTable('LTT_GoldLakehouse.dim_transaction_type')
shipping_mode.write.format('delta').mode('overwrite').saveAsTable('LTT_GoldLakehouse.dim_shipping_mode')
order_status.write.format('delta').mode('overwrite').saveAsTable('LTT_GoldLakehouse.dim_order_status')

StatementMeta(, , , Cancelled, )