In [1]:
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.sql.functions import monotonically_increasing_id
import pandas as pd
import numpy as np
import datetime
from pyspark.sql.window import Window

StatementMeta(, 5040a61d-4aae-44a6-b247-9d269448a5fc, 3, Finished, Available)

In [2]:
# Read every column of the cleaned Bronze table; the cells below build their tables from `df`.
bronze_query = "SELECT * FROM LTT_BronzeLakehouse.cleaned_df"
df = spark.sql(bronze_query)

StatementMeta(, 5040a61d-4aae-44a6-b247-9d269448a5fc, 4, Finished, Available)

### **Orders**

In [3]:
# Columns carried into the orders table: identifiers, categorical attributes,
# the two concatenated address/region keys, dates, and the numeric measures.
order_columns = [
    'order_id', 'customer_id', 'product_id',
    'shipping_mode', 'delivery_status', 'transaction_type', 'order_status', 'late_delivery_risk',
    'concat_destination_address', 'concat_customer_region',
    'order_date', 'shipping_date',
    'order_item_quantity', 'order_item_discount',
    'sales', 'benefit_per_order',
]
orders = df.select(*order_columns)

# Persist as a Delta table, replacing whatever the target table currently holds.
(
    orders.write
    .format('delta')
    .mode('overwrite')
    .saveAsTable('LTT_SilverLakehouse.orders')
)

StatementMeta(, 5040a61d-4aae-44a6-b247-9d269448a5fc, 5, Finished, Available)

### **0. Mapping Tables**

In [4]:
# Build one single-column lookup table per categorical attribute, each holding
# the distinct values of that column in `df`.
(
    order_status_table,     # order status
    shipping_table,         # shipping mode
    transaction_table,      # transaction type
    delivery_status_table,  # delivery status
    delivery_risk,          # late-delivery risk flag
) = [
    df.select(attribute).distinct()
    for attribute in (
        'order_status',
        'shipping_mode',
        'transaction_type',
        'delivery_status',
        'late_delivery_risk',
    )
]

StatementMeta(, 5040a61d-4aae-44a6-b247-9d269448a5fc, 6, Finished, Available)

In [5]:
# Print a preview of each lookup table, in the same order they were built.
for lookup_table in (
    order_status_table,
    shipping_table,
    transaction_table,
    delivery_status_table,
    delivery_risk,
):
    lookup_table.show()

StatementMeta(, 5040a61d-4aae-44a6-b247-9d269448a5fc, 7, Finished, Available)

+------------+
|order_status|
+------------+
|    COMPLETE|
|      CLOSED|
+------------+

+--------------+
| shipping_mode|
+--------------+
|Standard Class|
+--------------+

+----------------+
|transaction_type|
+----------------+
|            CASH|
|           DEBIT|
+----------------+

+----------------+
| delivery_status|
+----------------+
|Advance shipping|
|   Late delivery|
+----------------+

+------------------+
|late_delivery_risk|
+------------------+
|                 1|
|                 0|
+------------------+



### **1. Customer Table**

In [6]:
# Customer table: identifier, first/last name and segment.
customer_columns = ['customer_id', 'customer_fname', 'customer_lname', 'customer_segment']

customer_rows = df.select(*customer_columns)
print(customer_rows.count())  # row count before de-duplication

# Drop rows that are identical across all selected columns, then sort by id.
customer_table = customer_rows.dropDuplicates(customer_columns)
print(customer_table.count())  # row count after de-duplication
customer_table = customer_table.orderBy('customer_id')
customer_table.show(5)

StatementMeta(, 9e60fb1f-1f51-460d-925b-585ee758b758, 8, Finished, Available)

5322
1668
+-----------+--------------+--------------+----------------+
|customer_id|customer_fname|customer_lname|customer_segment|
+-----------+--------------+--------------+----------------+
|         12|   Christopher|         Smith|       Corporate|
|         13|          Mary|       Baldwin|     Home Office|
|         16|       Tiffany|         Smith|       Corporate|
|         17|          Mary|      Robinson|        Consumer|
|         19|     Stephanie|      Mitchell|     Home Office|
+-----------+--------------+--------------+----------------+
only showing top 5 rows



### **2. Customer Region Table**

In [7]:
customer_region_columns = [
    "concat_customer_region",
    "customer_country",
    "customer_state",
    "customer_city",
    "customer_street",
    "longitude",
    "latitude",
]

# Keep a single row per concat_customer_region value.
# NOTE(review): if rows sharing a key differ in the other columns, nothing here
# controls which one is kept — confirm that is acceptable.
customer_region_table = (
    df.select(*customer_region_columns)
    .dropDuplicates(["concat_customer_region"])
)

print(customer_region_table.count())

StatementMeta(, 9e60fb1f-1f51-460d-925b-585ee758b758, 9, Finished, Available)

1646


In [8]:
# Sort by the region key and preview the first five rows.
customer_region_table = customer_region_table.sort('concat_customer_region')
customer_region_table.show(5)

StatementMeta(, 9e60fb1f-1f51-460d-925b-585ee758b758, 10, Finished, Available)

+----------------------+----------------+--------------+-------------+------------------+------------+-----------+
|concat_customer_region|customer_country|customer_state|customer_city|   customer_street|   longitude|   latitude|
+----------------------+----------------+--------------+-------------+------------------+------------+-----------+
|  EE. UU.ARConway77...|         EE. UU.|            AR|       Conway|       Quiet Mall |-92.43454742|35.09189606|
|  EE. UU.ARJonesbor...|         EE. UU.|            AR|    Jonesboro|Middle Gate Towers|-90.65527344|35.85791779|
|  EE. UU.ARJonesbor...|         EE. UU.|            AR|    Jonesboro|       Dewy Villas|-97.45480347|25.91904068|
|  EE. UU.AZChandler...|         EE. UU.|            AZ|     Chandler|Hidden Anchor Port|-111.9437943|33.30524445|
|  EE. UU.AZGlendale...|         EE. UU.|            AZ|     Glendale|       Foggy Glade|-112.1931229|33.66019821|
+----------------------+----------------+--------------+-------------+----------

### **3. Destination Order Table**

In [9]:
desti_columns = [
    "concat_destination_address",
    "market",
    "order_region",
    "order_country",
    "order_state",
    "order_city",
]

# De-duplicate on every selected column, then sort by the address key.
desti_order_table = (
    df.select(*desti_columns)
    .dropDuplicates(desti_columns)
    .orderBy('concat_destination_address')
)
desti_order_table.show(5)

StatementMeta(, 9e60fb1f-1f51-460d-925b-585ee758b758, 11, Finished, Available)

+--------------------------+------+---------------+-------------+-----------+-----------------+
|concat_destination_address|market|   order_region|order_country|order_state|       order_city|
+--------------------------+------+---------------+-------------+-----------+-----------------+
|      Abreu e LimaPerna...| LATAM|  South America|       Brasil| Pernambuco|     Abreu e Lima|
|      AcayucanVeracruzM...| LATAM|Central America|       México|   Veracruz|         Acayucan|
|      AmatitlánGuatemal...| LATAM|Central America|    Guatemala|  Guatemala|        Amatitlán|
|      AnanindeuaParáBra...| LATAM|  South America|       Brasil|       Pará|       Ananindeua|
|      Antiguo Cuscatlán...| LATAM|Central America|  El Salvador|La Libertad|Antiguo Cuscatlán|
+--------------------------+------+---------------+-------------+-----------+-----------------+
only showing top 5 rows



### **4. Product Table**

In [10]:
product_columns = [
    "product_id",
    "product_category_id",
    "product_name",
    "product_price",
    "product_status",
]

# Distinct product rows (compared across all selected columns), sorted by id.
product_table = (
    df.select(*product_columns)
    .dropDuplicates()
    .orderBy('product_id')
)
print(product_table.count())
product_table.show(5)


StatementMeta(, 9e60fb1f-1f51-460d-925b-585ee758b758, 12, Finished, Available)

54
+----------+-------------------+--------------------+-------------+--------------+
|product_id|product_category_id|        product_name|product_price|product_status|
+----------+-------------------+--------------------+-------------+--------------+
|        37|                  3|adidas Kids' F5 M...|  34.99000168|             0|
|        44|                  3|adidas Men's F10 ...|  59.99000168|             0|
|        93|                  5|Under Armour Men'...|  24.98999977|             0|
|       116|                  6|Nike Men's Comfor...|  44.99000168|             0|
|       134|                  7|Nike Women's Lege...|         25.0|             0|
+----------+-------------------+--------------------+-------------+--------------+
only showing top 5 rows



### **5. Product Category Table**

In [11]:
category_columns = ["product_category_id", "category_name", "department_id"]

# Distinct category rows (compared across all selected columns), sorted by category id.
category_table = (
    df.select(*category_columns)
    .dropDuplicates()
    .orderBy('product_category_id')
)
print(category_table.count())
category_table.show(5)

StatementMeta(, 9e60fb1f-1f51-460d-925b-585ee758b758, 13, Finished, Available)

25
+-------------------+-------------------+-------------+
|product_category_id|      category_name|department_id|
+-------------------+-------------------+-------------+
|                  3|Baseball & Softball|            2|
|                  5|           Lacrosse|            2|
|                  6|   Tennis & Racquet|            2|
|                  7|             Hockey|            2|
|                  9|   Cardio Equipment|            3|
+-------------------+-------------------+-------------+
only showing top 5 rows



### **6. Department Table**

In [12]:
department_columns = ["department_id", "department_name"]

# Distinct (id, name) pairs, sorted by department id; printed with show()'s default row limit.
department_table = (
    df.select(*department_columns)
    .dropDuplicates()
    .orderBy('department_id')
)
department_table.show()

StatementMeta(, 9e60fb1f-1f51-460d-925b-585ee758b758, 14, Finished, Available)

+-------------+---------------+
|department_id|department_name|
+-------------+---------------+
|            2|        Fitness|
|            3|       Footwear|
|            4|        Apparel|
|            5|           Golf|
|            6|       Outdoors|
|            7|       Fan Shop|
+-------------+---------------+



### **Load to Table (not UTF-8 Format)**

In [15]:
# (target table, DataFrame) pairs, listed in the order they are written:
# the five lookup tables first, then the six entity tables.
silver_targets = [
    ('LTT_SilverLakehouse.order_status', order_status_table),
    ('LTT_SilverLakehouse.delivery_risk', delivery_risk),
    ('LTT_SilverLakehouse.shipping_mode', shipping_table),
    ('LTT_SilverLakehouse.transaction_type', transaction_table),
    ('LTT_SilverLakehouse.delivery_status', delivery_status_table),
    ('LTT_SilverLakehouse.customer', customer_table),
    ('LTT_SilverLakehouse.customer_region', customer_region_table),
    ('LTT_SilverLakehouse.desti_order', desti_order_table),
    ('LTT_SilverLakehouse.product', product_table),
    ('LTT_SilverLakehouse.category', category_table),
    ('LTT_SilverLakehouse.department', department_table),
]

# Save each DataFrame as a Delta table in overwrite mode.
for target_name, table_df in silver_targets:
    table_df.write.format("delta").mode("overwrite").saveAsTable(target_name)

StatementMeta(, 9710a5e3-0c3c-43ee-bfc2-9f9bea323ca1, 17, Finished, Available)