In [1]:
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.sql.functions import monotonically_increasing_id
import pandas as pd
import numpy as np
import datetime
from pyspark.sql.window import Window

StatementMeta(, 8d5af86b-de49-47c4-9f0e-ca27ef8ba742, 3, Finished, Available)

In [2]:
# Read every column of the cleaned dataset from the Bronze lakehouse table,
# then print the schema so the available columns are visible up front.
source_query = "SELECT * FROM LTT_BronzeLakehouse.cleaned_df"
df = spark.sql(source_query)
df.printSchema()

StatementMeta(, 4d0e1dbf-902a-4011-8c33-f05c93cdd20c, 4, Finished, Available)

root
 |-- transaction_type: string (nullable = true)
 |-- days_for_shipping_real: integer (nullable = true)
 |-- days_for_shipment_scheduled: integer (nullable = true)
 |-- benefit_per_order: double (nullable = true)
 |-- sales_per_customer: double (nullable = true)
 |-- delivery_status: string (nullable = true)
 |-- late_delivery_risk: integer (nullable = true)
 |-- category_id: integer (nullable = true)
 |-- category_name: string (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_country: string (nullable = true)
 |-- customer_email: string (nullable = true)
 |-- customer_fname: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- customer_lname: string (nullable = true)
 |-- customer_password: string (nullable = true)
 |-- customer_segment: string (nullable = true)
 |-- customer_state: string (nullable = true)
 |-- customer_street: string (nullable = true)
 |-- department_id: integer (nullable = true)
 |-- department_name: string (nulla

In [3]:
# Total number of rows in the cleaned source DataFrame `df`.
df.count()

StatementMeta(, 4d0e1dbf-902a-4011-8c33-f05c93cdd20c, 5, Finished, Available)

5322

### **0. Mapping Tables**

In [4]:
# Build one single-column lookup DataFrame per categorical field by taking the
# distinct values of that column in `df`. The targets on the left line up
# position-by-position with the column names on the right.
(
    order_status_table,      # distinct order_status values
    shipping_table,          # distinct shipping_mode values
    transaction_table,       # distinct transaction_type values
    delivery_status_table,   # distinct delivery_status values
    delivery_risk,           # distinct late_delivery_risk values
) = [
    df.select(lookup_column).distinct()
    for lookup_column in (
        'order_status',
        'shipping_mode',
        'transaction_type',
        'delivery_status',
        'late_delivery_risk',
    )
]

StatementMeta(, 4d0e1dbf-902a-4011-8c33-f05c93cdd20c, 6, Finished, Available)

In [5]:
# # append new row data
# # Check the previous order status table. If there is anything new, add it to a df and
# # then append it into the order status table
# available_order_status = spark.sql("SELECT * FROM LTT_BronzeLakehouse.order_status")
# order_status_table = order_status_table.exceptAll(available_order_status)

# available_shipping_table = spark.sql("SELECT * FROM LTT_BronzeLakehouse.shipping")
# shipping_table = shipping_table.exceptAll(available_shipping_table)

# available_transaction_type = spark.sql("SELECT * FROM LTT_BronzeLakehouse.transaction")
# transaction_table = transaction_table.exceptAll(available_transaction_type)

# available_delivery_risk = spark.sql("SELECT * FROM LTT_BronzeLakehouse.delivery_risk")
# delivery_risk = delivery_risk.exceptAll(available_delivery_risk)

# available_delivery_status = spark.sql("SELECT * FROM LTT_BronzeLakehouse.delivery_status")
# delivery_status_table = delivery_status_table.exceptAll(available_delivery_status)

StatementMeta(, 4d0e1dbf-902a-4011-8c33-f05c93cdd20c, 7, Finished, Available)

### **1. Customer Table**

In [6]:
# # Load the customer table that is already available from the previous load
# ava_customer =  spark.sql("SELECT * FROM LTT_BronzeLakehouse.customer")
# print(ava_customer.count())

StatementMeta(, 4d0e1dbf-902a-4011-8c33-f05c93cdd20c, 8, Finished, Available)

In [7]:
# Build the customer table from the cleaned dataset: project the customer
# attributes, report the row count before and after de-duplication, then sort.
customer_columns = ['customer_id', 'customer_fname', 'customer_lname', 'customer_segment']

customer_rows = df.select(customer_columns)
print(customer_rows.count())  # rows before dropping duplicates

# Rows count as duplicates only when all four selected attributes match.
customer_table = customer_rows.dropDuplicates(customer_columns)
print(customer_table.count())  # rows after dropping duplicates

customer_table = customer_table.orderBy('customer_id')
customer_table.show(5)

StatementMeta(, 4d0e1dbf-902a-4011-8c33-f05c93cdd20c, 9, Finished, Available)

5322
1668
+-----------+--------------+--------------+----------------+
|customer_id|customer_fname|customer_lname|customer_segment|
+-----------+--------------+--------------+----------------+
|         12|   Christopher|         Smith|       Corporate|
|         13|          Mary|       Baldwin|     Home Office|
|         16|       Tiffany|         Smith|       Corporate|
|         17|          Mary|      Robinson|        Consumer|
|         19|     Stephanie|      Mitchell|     Home Office|
+-----------+--------------+--------------+----------------+
only showing top 5 rows



In [8]:
# # Exclude customers that already exist in the source; load only the new customers into the dataset
# customer_table = customer_table.exceptAll(ava_customer)

StatementMeta(, 4d0e1dbf-902a-4011-8c33-f05c93cdd20c, 10, Finished, Available)

In [9]:
# Row count of customer_table at this point (the exceptAll filter in the cell above is commented out).
customer_table.count()

StatementMeta(, 4d0e1dbf-902a-4011-8c33-f05c93cdd20c, 11, Finished, Available)

1668

### **2. Customer Region Table**

In [10]:
# # Load the customer region table that is already available from the full load
# ava_customer_region =  spark.sql("SELECT * FROM LTT_BronzeLakehouse.customer_region")
# print(ava_customer_region.count())

StatementMeta(, 4d0e1dbf-902a-4011-8c33-f05c93cdd20c, 12, Finished, Available)

In [11]:
# Columns that make up the customer region table; `concat_customer_region`
# is the column used for de-duplication below.
customer_region_columns = [
    "concat_customer_region",
    "customer_country",
    "customer_state",
    "customer_city",
    "customer_street",
    "longitude",
    "latitude",
]

# Keep a single row per `concat_customer_region` value.
# NOTE(review): duplicates are detected on the key column only and no ordering
# is applied first, so which row survives for a repeated key is not controlled
# here - confirm that the other columns are identical for a given key.
customer_region_table = (
    df.select(customer_region_columns)
      .dropDuplicates(subset=["concat_customer_region"])
)

print(customer_region_table.count())

StatementMeta(, 4d0e1dbf-902a-4011-8c33-f05c93cdd20c, 13, Finished, Available)

1646


In [12]:
# customer_region_table = customer_region_table.orderBy('concat_customer_region')
# customer_region_table.count()

StatementMeta(, 4d0e1dbf-902a-4011-8c33-f05c93cdd20c, 14, Finished, Available)

In [13]:
# ava_customer_region.createOrReplaceTempView('ava_customer_region')
# customer_region_table.createOrReplaceTempView('customer_region_table')
# # Same approach: keep only the records that are not in the previous df so they can be appended
# customer_region_table = spark.sql("""
#     SELECT * FROM customer_region_table 
#     WHERE concat_customer_region 
#     NOT IN (
#         SELECT concat_customer_region 
#         FROM ava_customer_region
#     )
# """)


StatementMeta(, 4d0e1dbf-902a-4011-8c33-f05c93cdd20c, 15, Finished, Available)

In [14]:
# Row count of customer_region_table at this point (the NOT IN filter in the cell above is commented out).
customer_region_table.count()

StatementMeta(, 4d0e1dbf-902a-4011-8c33-f05c93cdd20c, 16, Finished, Available)

1646

### **3. Destination Order Table**

In [15]:
# # Load the destination order table that is already available from the full load
# ava_desti_order =  spark.sql("SELECT * FROM LTT_BronzeLakehouse.desti_order")
# print(ava_desti_order.count())

StatementMeta(, 4d0e1dbf-902a-4011-8c33-f05c93cdd20c, 17, Finished, Available)

In [16]:
# Columns describing an order destination.
desti_columns = [
    "concat_destination_address",
    "market",
    "order_region",
    "order_country",
    "order_state",
    "order_city",
]

# Distinct destination rows (duplicates are judged on all selected columns),
# sorted by the concatenated destination address.
desti_order_table = (
    df.select(desti_columns)
      .dropDuplicates(subset=desti_columns)
      .orderBy('concat_destination_address')
)
desti_order_table.count()

StatementMeta(, 4d0e1dbf-902a-4011-8c33-f05c93cdd20c, 18, Finished, Available)

312

In [17]:
# distinct_concat_destinations = ava_desti_order.select("concat_destination_address").distinct()

# # Filter desti_order_table DataFrame to exclude rows where concat_destination_address is in ava_desti_order
# desti_order_table = desti_order_table.join(
#     distinct_concat_destinations,
#     desti_order_table.concat_destination_address == distinct_concat_destinations.concat_destination_address,
#     "left_anti"
# )

StatementMeta(, 4d0e1dbf-902a-4011-8c33-f05c93cdd20c, 19, Finished, Available)

In [18]:
# ava_desti_order.createOrReplaceTempView('ava_desti_order')
# desti_order_table.createOrReplaceTempView('desti_order_table')
# # Same approach: keep only the records that are not in the full-load df so they can be appended
# desti_order_table = spark.sql("""
#     SELECT * FROM desti_order_table
#     WHERE concat_destination_address 
#     NOT IN (
#         SELECT concat_destination_address 
#         FROM ava_desti_order
#     )
# """)


StatementMeta(, 4d0e1dbf-902a-4011-8c33-f05c93cdd20c, 20, Finished, Available)

In [19]:
# Row count of desti_order_table at this point (the anti-join / NOT IN filters in the cells above are commented out).
desti_order_table.count()

StatementMeta(, 4d0e1dbf-902a-4011-8c33-f05c93cdd20c, 21, Finished, Available)

312

### **4. Product Table**

In [20]:
# Attributes that make up the product table.
product_columns = [
    "product_id",
    "product_category_id",
    "product_name",
    "product_price",
    "product_status",
]

# Distinct combinations of the product attributes, sorted by product id.
product_distinct = df.select(product_columns).dropDuplicates()
product_table = product_distinct.orderBy('product_id')

print(product_table.count())
product_table.show(5)


StatementMeta(, 4d0e1dbf-902a-4011-8c33-f05c93cdd20c, 22, Finished, Available)

54
+----------+-------------------+--------------------+-------------+--------------+
|product_id|product_category_id|        product_name|product_price|product_status|
+----------+-------------------+--------------------+-------------+--------------+
|        37|                  3|adidas Kids' F5 M...|  34.99000168|             0|
|        44|                  3|adidas Men's F10 ...|  59.99000168|             0|
|        93|                  5|Under Armour Men'...|  24.98999977|             0|
|       116|                  6|Nike Men's Comfor...|  44.99000168|             0|
|       134|                  7|Nike Women's Lege...|         25.0|             0|
+----------+-------------------+--------------------+-------------+--------------+
only showing top 5 rows



### **5. Product Category Table**

In [21]:
# Attributes that make up the product category table.
category_columns = [
    "product_category_id",
    "category_name",
    "department_id",
]

# Distinct category rows, sorted by category id.
category_distinct = df.select(category_columns).dropDuplicates()
category_table = category_distinct.orderBy('product_category_id')

print(category_table.count())
category_table.show(5)

# Disabled incremental-load step, kept for reference.
# NOTE(review): `ava_category` is not defined in any visible cell - it would
# have to be loaded before this step can be re-enabled.
# category_table = category_table.exceptAll(ava_category)
# category_table.count()

StatementMeta(, 4d0e1dbf-902a-4011-8c33-f05c93cdd20c, 23, Finished, Available)

25
+-------------------+-------------------+-------------+
|product_category_id|      category_name|department_id|
+-------------------+-------------------+-------------+
|                  3|Baseball & Softball|            2|
|                  5|           Lacrosse|            2|
|                  6|   Tennis & Racquet|            2|
|                  7|             Hockey|            2|
|                  9|   Cardio Equipment|            3|
+-------------------+-------------------+-------------+
only showing top 5 rows



### **6. Department Table**

In [25]:
# ava_department =  spark.sql("SELECT * FROM LTT_BronzeLakehouse.department")
# print(ava_department.count())

StatementMeta(, 8d5af86b-de49-47c4-9f0e-ca27ef8ba742, 27, Finished, Available)

In [22]:
# Distinct (department_id, department_name) pairs, sorted by department id.
department_columns = ["department_id", "department_name"]
department_table = (
    df.select(department_columns)
      .dropDuplicates()
      .orderBy('department_id')
)
department_table.show()

StatementMeta(, 4d0e1dbf-902a-4011-8c33-f05c93cdd20c, 24, Finished, Available)

+-------------+---------------+
|department_id|department_name|
+-------------+---------------+
|            2|        Fitness|
|            3|       Footwear|
|            4|        Apparel|
|            5|           Golf|
|            6|       Outdoors|
|            7|       Fan Shop|
+-------------+---------------+



In [27]:
# department_table = department_table.exceptAll(ava_department)
# department_table.count()

StatementMeta(, 8d5af86b-de49-47c4-9f0e-ca27ef8ba742, 29, Finished, Available)

### **Load to Tables in Silver Layer (not UTF-8 Format)**

In [25]:
# Print the row count of every table that is about to be written.
#
# Labels are mapped to their DataFrames once and reported in a single loop,
# in the same order as before. The customer region entry used to be labelled
# "Store Table Count" even though the DataFrame being counted is
# `customer_region_table`; the label now matches the table.
#
# NOTE(review): the incremental exceptAll / anti-join filters in the earlier
# cells are commented out, so these are counts of the full de-duplicated
# tables - confirm whether the "new records" header is still intended.
tables_to_report = {
    # Mapping (lookup) tables
    "Order Status Table": order_status_table,
    "Delivery Risk": delivery_risk,
    "Shipping Mode Table": shipping_table,
    "Transaction Type Table": transaction_table,
    "Delivery Status Table": delivery_status_table,
    # Entity tables
    "Customer Table": customer_table,
    "Customer Region Table": customer_region_table,
    "Destination": desti_order_table,
    "Product Table": product_table,
    "Category Table": category_table,
    "Department Table": department_table,
}

print("Number of new records: ")
for report_label, report_df in tables_to_report.items():
    print(f"{report_label} Count: {report_df.count()}")

StatementMeta(, 4d0e1dbf-902a-4011-8c33-f05c93cdd20c, 27, Finished, Available)

Number of new records: 
Order Status Table Count: 9
Delivery Risk Count: 2
Shipping Mode Table Count: 4
Transaction Type Table Count: 4
Delivery Status Table Count: 4
Customer Table Count: 1668
Store Table Count: 1646
Destination Count: 312
Product Table Count: 54
Category Table Count: 25
Department Table Count: 6


In [27]:
# Persist each mapping (lookup) DataFrame as a Delta table in the Silver
# lakehouse. Mode "overwrite" replaces the existing table contents on every run.
mapping_targets = (
    (order_status_table, 'LTT_SilverLakehouse.order_status'),
    (delivery_risk, 'LTT_SilverLakehouse.delivery_risk'),
    (shipping_table, 'LTT_SilverLakehouse.shipping_mode'),
    (transaction_table, 'LTT_SilverLakehouse.transaction_type'),
    (delivery_status_table, 'LTT_SilverLakehouse.delivery_status'),
)

for mapping_df, target_table in mapping_targets:
    mapping_df.write.format("delta").mode("overwrite").saveAsTable(target_table)

StatementMeta(, 4d0e1dbf-902a-4011-8c33-f05c93cdd20c, 29, Finished, Available)

In [28]:
# Persist each entity DataFrame as a Delta table in the Silver lakehouse.
# Mode "overwrite" replaces the existing table contents on every run.
entity_targets = (
    (customer_table, 'LTT_SilverLakehouse.customer'),
    (customer_region_table, 'LTT_SilverLakehouse.customer_region'),
    (desti_order_table, 'LTT_SilverLakehouse.desti_order'),
    (product_table, 'LTT_SilverLakehouse.product'),
    (category_table, 'LTT_SilverLakehouse.category'),
    (department_table, 'LTT_SilverLakehouse.department'),
)

for entity_df, target_table in entity_targets:
    entity_df.write.format("delta").mode("overwrite").saveAsTable(target_table)

StatementMeta(, 4d0e1dbf-902a-4011-8c33-f05c93cdd20c, 30, Finished, Available)