In [1]:
from pyspark.sql.functions import monotonically_increasing_id, when, col, row_number
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql import Row

StatementMeta(, cc00cf3b-3acd-4d2c-8c14-34dc8a70ef3a, 3, Finished, Available)


## **Dataframe**

## **Mapping Table**

In [2]:
def create_mapping_dataframe(df, column_name, column_name_id):
    """Build a surrogate-key mapping for the distinct values of one column.

    Args:
        df: Source Spark DataFrame.
        column_name: Name of the column whose distinct values are mapped.
        column_name_id: Name of the surrogate-key column to generate.

    Returns:
        A DataFrame with the columns ``(column_name_id, column_name)``, one row
        per distinct value, keyed 1..N in ascending order of the value.
    """
    unique_values = df.select(column_name).distinct()
    # monotonically_increasing_id() only guarantees unique, increasing values:
    # the ids depend on partitioning, so they are neither consecutive nor
    # stable between runs. row_number() over an ordered window gives
    # deterministic, gap-free keys. The cast keeps the key column a long, as
    # the previous implementation produced.
    key_order = Window.orderBy(column_name)
    mapping_df = unique_values.withColumn(
        column_name_id, row_number().over(key_order).cast("long")
    )
    return mapping_df.select(column_name_id, column_name)

StatementMeta(, cc00cf3b-3acd-4d2c-8c14-34dc8a70ef3a, 4, Finished, Available)

### **Transaction Type**

In [3]:
# New (Silver) table first, then the old (Gold) dimension it is compared against
transaction_table, old_transaction_table = (
    spark.sql("SELECT * FROM LTT_SilverLakehouse.transaction_type"),
    spark.sql("SELECT * FROM LTT_GoldLakehouse.dim_transaction_type"),
)


StatementMeta(, cc00cf3b-3acd-4d2c-8c14-34dc8a70ef3a, 5, Finished, Available)

In [4]:
# Row counts first, then the contents (Silver before Gold in both passes)
for frame in (transaction_table, old_transaction_table):
    print(frame.count())
for frame in (transaction_table, old_transaction_table):
    frame.show()

StatementMeta(, cc00cf3b-3acd-4d2c-8c14-34dc8a70ef3a, 6, Finished, Available)

4
4
+----------------+
|transaction_type|
+----------------+
|        TRANSFER|
|            CASH|
|         PAYMENT|
|           DEBIT|
+----------------+

+--------------------+----------------+
|transaction_type_key|transaction_type|
+--------------------+----------------+
|                   3|         PAYMENT|
|                   4|        TRANSFER|
|                   1|            CASH|
|                   2|           DEBIT|
+--------------------+----------------+



In [5]:
# Highest surrogate key already stored in the Gold dimension. F.max() yields
# None when the table is empty, so fall back to 0 and let new keys start at 1.
max_id = old_transaction_table.agg(F.max("transaction_type_key").alias("max_id")).collect()[0]["max_id"] or 0
max_id

StatementMeta(, cc00cf3b-3acd-4d2c-8c14-34dc8a70ef3a, 7, Finished, Available)

4

In [6]:
# Keep only the transaction types that are not yet in the Gold dimension.
# subtract() is EXCEPT DISTINCT: unlike exceptAll() (a multiset difference) it
# never returns a value that already exists in Gold and never returns the same
# new value twice. Selecting the name column explicitly on both sides also
# keeps this cell re-runnable after the key column has been added below.
transaction_table = transaction_table.select("transaction_type").subtract(
    old_transaction_table.select("transaction_type")
)
# Unpartitioned window ordered by name, so key assignment is deterministic
window = Window.orderBy("transaction_type")
# Continue the surrogate keys after the current maximum: max_id + 1, max_id + 2, ...
transaction_table = transaction_table.withColumn("transaction_type_key", max_id + row_number().over(window))

transaction_table = transaction_table.select('transaction_type_key', 'transaction_type')
transaction_table.show()

StatementMeta(, cc00cf3b-3acd-4d2c-8c14-34dc8a70ef3a, 8, Finished, Available)

+--------------------+----------------+
|transaction_type_key|transaction_type|
+--------------------+----------------+
+--------------------+----------------+



### **Order Status**


In [10]:
# New (Silver) values and the old (Gold) dimension they are compared against
order_status_table = spark.sql("SELECT * FROM LTT_SilverLakehouse.order_status")
old_order_status = spark.sql("SELECT * FROM LTT_GoldLakehouse.dim_order_status")

# Row counts first, then the contents (Silver before Gold in both passes)
for frame in (order_status_table, old_order_status):
    print(frame.count())
for frame in (order_status_table, old_order_status):
    frame.show()

StatementMeta(, cc00cf3b-3acd-4d2c-8c14-34dc8a70ef3a, 12, Finished, Available)

9
9
+---------------+
|   order_status|
+---------------+
|PENDING_PAYMENT|
|       COMPLETE|
|        ON_HOLD|
| PAYMENT_REVIEW|
|     PROCESSING|
|         CLOSED|
|SUSPECTED_FRAUD|
|        PENDING|
|       CANCELED|
+---------------+

+----------------+---------------+
|order_status_key|   order_status|
+----------------+---------------+
|               3|       CANCELED|
|               4|        ON_HOLD|
|               5| PAYMENT_REVIEW|
|               6|        PENDING|
|               7|PENDING_PAYMENT|
|               8|     PROCESSING|
|               9|SUSPECTED_FRAUD|
|               1|       COMPLETE|
|               2|         CLOSED|
+----------------+---------------+



In [11]:
# Highest surrogate key already stored in the Gold dimension. F.max() yields
# None when the table is empty, so fall back to 0 and let new keys start at 1.
max_id = old_order_status.agg(F.max("order_status_key").alias("max_order_status_id")).collect()[0]["max_order_status_id"] or 0

# Keep only the order statuses that are not yet in the Gold dimension.
# subtract() is EXCEPT DISTINCT: unlike exceptAll() (a multiset difference) it
# never returns a value that already exists in Gold and never returns the same
# new value twice. Selecting the name column explicitly on both sides also
# keeps this cell re-runnable after the key column has been added below.
order_status_table = order_status_table.select("order_status").subtract(
    old_order_status.select("order_status")
)
# Unpartitioned window ordered by name, so key assignment is deterministic
window = Window.orderBy("order_status")
# Continue the surrogate keys after the current maximum: max_id + 1, max_id + 2, ...
order_status_table = order_status_table.withColumn("order_status_key", max_id + row_number().over(window))

order_status_table = order_status_table.select('order_status_key', 'order_status')
display(order_status_table)

StatementMeta(, cc00cf3b-3acd-4d2c-8c14-34dc8a70ef3a, 13, Finished, Available)

SynapseWidget(Synapse.DataFrame, 437edf26-7aa3-412f-9db4-7b9bd04d2b53)

### **Delivery Status**

In [12]:
# New (Silver) values and the old (Gold) dimension they are compared against
delivery_status_table = spark.sql("SELECT * FROM LTT_SilverLakehouse.delivery_status")
old_delivery_status = spark.sql("SELECT * FROM LTT_GoldLakehouse.dim_delivery_status")

# Row counts first, then the contents (Silver before Gold in both passes)
for frame in (delivery_status_table, old_delivery_status):
    print(frame.count())
for frame in (delivery_status_table, old_delivery_status):
    frame.show()

StatementMeta(, cc00cf3b-3acd-4d2c-8c14-34dc8a70ef3a, 14, Finished, Available)

4
4
+-----------------+
|  delivery_status|
+-----------------+
| Shipping on time|
| Advance shipping|
|Shipping canceled|
|    Late delivery|
+-----------------+

+-------------------+-----------------+
|delivery_status_key|  delivery_status|
+-------------------+-----------------+
|                  3|Shipping canceled|
|                  4| Shipping on time|
|                  1| Advance shipping|
|                  2|    Late delivery|
+-------------------+-----------------+



In [14]:
# Highest surrogate key already stored in the Gold dimension. F.max() yields
# None when the table is empty, so fall back to 0 and let new keys start at 1.
max_id = old_delivery_status.agg(F.max("delivery_status_key").alias("max_id")).collect()[0]["max_id"] or 0
max_id

StatementMeta(, cc00cf3b-3acd-4d2c-8c14-34dc8a70ef3a, 16, Finished, Available)

4

In [15]:
# Keep only the delivery statuses that are not yet in the Gold dimension.
# subtract() is EXCEPT DISTINCT: unlike exceptAll() (a multiset difference) it
# never returns a value that already exists in Gold and never returns the same
# new value twice. Selecting the name column explicitly on both sides also
# keeps this cell re-runnable after the key column has been added below.
delivery_status_table = delivery_status_table.select("delivery_status").subtract(
    old_delivery_status.select("delivery_status")
)
# Unpartitioned window ordered by name, so key assignment is deterministic
window_spec_key = Window.orderBy("delivery_status")
# Continue the surrogate keys after the current maximum: max_id + 1, max_id + 2, ...
delivery_status_table = delivery_status_table.withColumn("delivery_status_key", max_id + row_number().over(window_spec_key))

delivery_status_table = delivery_status_table.select('delivery_status_key', 'delivery_status')
display(delivery_status_table)

StatementMeta(, cc00cf3b-3acd-4d2c-8c14-34dc8a70ef3a, 17, Finished, Available)

SynapseWidget(Synapse.DataFrame, 8af20685-4a3b-45f9-a7a1-38ea9f38027b)

### **Shipping Mode**

In [16]:
# New (Silver) values and the old (Gold) dimension they are compared against
shipping_table = spark.sql("SELECT * FROM LTT_SilverLakehouse.shipping_mode")
old_shipping_table = spark.sql("SELECT * FROM LTT_GoldLakehouse.dim_shipping_mode")

# Row counts: Silver first, then Gold
for frame in (shipping_table, old_shipping_table):
    print(frame.count())

StatementMeta(, cc00cf3b-3acd-4d2c-8c14-34dc8a70ef3a, 18, Finished, Available)

4
4


In [17]:
# Highest surrogate key already stored in the Gold dimension. F.max() yields
# None when the table is empty, so fall back to 0 and let new keys start at 1.
max_id = old_shipping_table.agg(F.max("shipping_mode_key").alias("max_id")).collect()[0]["max_id"] or 0
max_id

StatementMeta(, cc00cf3b-3acd-4d2c-8c14-34dc8a70ef3a, 19, Finished, Available)

4

In [18]:
# Keep only the shipping modes that are not yet in the Gold dimension.
# subtract() is EXCEPT DISTINCT: unlike exceptAll() (a multiset difference) it
# never returns a value that already exists in Gold and never returns the same
# new value twice. Selecting the name column explicitly on both sides also
# keeps this cell re-runnable after the key column has been added below.
shipping_table = shipping_table.select("shipping_mode").subtract(
    old_shipping_table.select("shipping_mode")
)
# Unpartitioned window ordered by name, so key assignment is deterministic
window_spec_key = Window.orderBy("shipping_mode")
# Continue the surrogate keys after the current maximum: max_id + 1, max_id + 2, ...
shipping_table = shipping_table.withColumn("shipping_mode_key", max_id + row_number().over(window_spec_key))

shipping_table = shipping_table.select('shipping_mode_key', 'shipping_mode')

StatementMeta(, cc00cf3b-3acd-4d2c-8c14-34dc8a70ef3a, 20, Finished, Available)

In [19]:
# Render the shipping-mode rows that remain after filtering against Gold
display(shipping_table)

StatementMeta(, cc00cf3b-3acd-4d2c-8c14-34dc8a70ef3a, 21, Finished, Available)

SynapseWidget(Synapse.DataFrame, 8babc5cb-59e7-4666-a5a3-b40dad5c36cf)

### **Delivery Risk**

In [20]:
# New (Silver) values and the old (Gold) dimension they are compared against
delivery_risk = spark.sql("SELECT * FROM LTT_SilverLakehouse.delivery_risk")
old_delivery_risk = spark.sql("SELECT * FROM LTT_GoldLakehouse.dim_delivery_risk")

# Contents: Silver first, then Gold
for frame in (delivery_risk, old_delivery_risk):
    frame.show()

StatementMeta(, cc00cf3b-3acd-4d2c-8c14-34dc8a70ef3a, 22, Finished, Available)

+------------------+
|late_delivery_risk|
+------------------+
|                 1|
|                 0|
+------------------+

+-----------------+------------------+
|delivery_risk_key|delivery_risk_name|
+-----------------+------------------+
|                1|           is late|
|                2|          not late|
|                3|           unknown|
+-----------------+------------------+



In [21]:
# Single-row frame carrying the extra risk value 2.
# NOTE(review): presumably a sentinel for the "unknown" category, since the
# name mapping sends every value other than 0/1 to "unknown" - confirm.
new_row = Row(late_delivery_risk=2)
new_df = spark.createDataFrame([new_row])

# Append the sentinel row. distinct() keeps this cell idempotent: because it
# reassigns `delivery_risk`, re-running it would otherwise add another
# duplicate sentinel row on every execution.
delivery_risk = delivery_risk.union(new_df).distinct()


StatementMeta(, cc00cf3b-3acd-4d2c-8c14-34dc8a70ef3a, 23, Finished, Available)

In [22]:
# Translate the numeric risk flag into its display name:
#   1 -> "is late", 0 -> "not late", anything else -> "unknown"
risk_flag = col("late_delivery_risk")
risk_name = (
    when(risk_flag == 1, "is late")
    .when(risk_flag == 0, "not late")
    .otherwise("unknown")
)

# Add the name column, then drop the raw flag it was derived from
delivery_risk = delivery_risk.withColumn("delivery_risk_name", risk_name).drop('late_delivery_risk')
display(delivery_risk)


StatementMeta(, cc00cf3b-3acd-4d2c-8c14-34dc8a70ef3a, 24, Finished, Available)

SynapseWidget(Synapse.DataFrame, d23e2646-8a45-484a-a6af-90807b936d3f)

In [23]:
# Base for the next surrogate key. Use the highest existing key rather than
# the row count: the two differ as soon as the key sequence has a gap, and
# count-based keys would then collide with keys already in the dimension.
# F.max() yields None for an empty table, so fall back to 0.
max_id = old_delivery_risk.agg(F.max("delivery_risk_key").alias("max_id")).collect()[0]["max_id"] or 0
max_id

StatementMeta(, cc00cf3b-3acd-4d2c-8c14-34dc8a70ef3a, 25, Finished, Available)

3

In [24]:
# Print the current contents of the Gold delivery-risk dimension
old_delivery_risk.show()

StatementMeta(, cc00cf3b-3acd-4d2c-8c14-34dc8a70ef3a, 26, Finished, Available)

+-----------------+------------------+
|delivery_risk_key|delivery_risk_name|
+-----------------+------------------+
|                1|           is late|
|                2|          not late|
|                3|           unknown|
+-----------------+------------------+



In [25]:
# Keep only the delivery-risk names that are not yet in the Gold dimension.
# subtract() is EXCEPT DISTINCT: unlike exceptAll() (a multiset difference) it
# never returns a value that already exists in Gold and never returns the same
# new value twice. Selecting the name column explicitly on both sides also
# keeps this cell re-runnable after the key column has been added below.
delivery_risk = delivery_risk.select("delivery_risk_name").subtract(
    old_delivery_risk.select("delivery_risk_name")
)
# Unpartitioned window ordered by name, so key assignment is deterministic
window_spec_key = Window.orderBy("delivery_risk_name")
# Continue the surrogate keys after the current maximum: max_id + 1, max_id + 2, ...
delivery_risk = delivery_risk.withColumn("delivery_risk_key", max_id + row_number().over(window_spec_key))

delivery_risk = delivery_risk.select('delivery_risk_key', 'delivery_risk_name')
delivery_risk.show()

StatementMeta(, cc00cf3b-3acd-4d2c-8c14-34dc8a70ef3a, 27, Finished, Available)

+-----------------+------------------+
|delivery_risk_key|delivery_risk_name|
+-----------------+------------------+
+-----------------+------------------+



> There are only 2 delivery risk types, so we keep it simple and don't load anything.

## **Confirm data type**

In [26]:
def _with_integer_key(frame, key_column):
    """Return ``frame`` with ``key_column`` cast to Spark's integer type."""
    return frame.withColumn(key_column, col(key_column).cast("integer"))


# Make sure every surrogate-key column is an integer before loading
shipping_table = _with_integer_key(shipping_table, "shipping_mode_key")
delivery_status_table = _with_integer_key(delivery_status_table, "delivery_status_key")
transaction_table = _with_integer_key(transaction_table, "transaction_type_key")
order_status_table = _with_integer_key(order_status_table, "order_status_key")
delivery_risk = _with_integer_key(delivery_risk, "delivery_risk_key")

StatementMeta(, cc00cf3b-3acd-4d2c-8c14-34dc8a70ef3a, 28, Finished, Available)

## **Load into Gold Lakehouse**

In [27]:
# Report how many new rows each dimension is about to receive
print("The difference btw number of records")
pending_frames = (
    ("Order Status Table Count", order_status_table),
    ("Delivery Risk Count", delivery_risk),
    ("Shipping Table Count", shipping_table),
    ("Transaction Table Count", transaction_table),
    ("Delivery Status Table Count", delivery_status_table),
)
for label, frame in pending_frames:
    print(f"{label}: {frame.count()}")

StatementMeta(, cc00cf3b-3acd-4d2c-8c14-34dc8a70ef3a, 29, Finished, Available)

The difference btw number of records
Order Status Table Count: 0
Delivery Risk Count: 0
Shipping Table Count: 0
Transaction Table Count: 0
Delivery Status Table Count: 0


In [28]:
# Append each frame of new rows to its Gold dimension table, in this order
gold_targets = (
    (delivery_status_table, 'LTT_GoldLakehouse.dim_delivery_status'),
    (transaction_table, 'LTT_GoldLakehouse.dim_transaction_type'),
    (shipping_table, 'LTT_GoldLakehouse.dim_shipping_mode'),
    (order_status_table, 'LTT_GoldLakehouse.dim_order_status'),
    (delivery_risk, 'LTT_GoldLakehouse.dim_delivery_risk'),
)
for frame, table_name in gold_targets:
    frame.write.format('delta').mode('append').saveAsTable(table_name)

StatementMeta(, cc00cf3b-3acd-4d2c-8c14-34dc8a70ef3a, 30, Finished, Available)