In [None]:
from pyspark.sql.functions import sum, col, concat
from pyspark.sql.functions import to_timestamp, date_format
from pyspark.sql.functions import desc, last_day, lpad, when
from pyspark.sql import functions as F

import math
from pyspark.sql.functions import udf, coalesce, lit
from pyspark.sql.types import DoubleType
from pyspark.sql.types import StringType, IntegerType, FloatType

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 3, Finished, Available)

## **Import Tables from the Gold Lakehouse (and Orders from the Silver Lakehouse)**

In [None]:
# Load every dimension in full from the Gold lakehouse.
# `spark` is not defined in this notebook — presumably the session injected by the runtime (TODO confirm).
# Entity dimensions: customer and the product -> category -> department hierarchy.
dim_customer = spark.sql("SELECT * FROM LTT_GoldLakehouse.dim_customer")
dim_product = spark.sql("SELECT * FROM LTT_GoldLakehouse.dim_product")
dim_category = spark.sql("SELECT * FROM LTT_GoldLakehouse.dim_category")
dim_department = spark.sql("SELECT * FROM LTT_GoldLakehouse.dim_department")

# Location dimensions.
dim_store = spark.sql("SELECT * FROM LTT_GoldLakehouse.dim_store")
dim_destination = spark.sql("SELECT * FROM LTT_GoldLakehouse.dim_destination")

# Small mapping dimensions. Note that `dim_shipping` reads the table named dim_shipping_mode.
dim_order_status = spark.sql("SELECT * FROM LTT_GoldLakehouse.dim_order_status")
dim_delivery_risk = spark.sql("SELECT * FROM LTT_GoldLakehouse.dim_delivery_risk")
dim_delivery_status = spark.sql("SELECT * FROM LTT_GoldLakehouse.dim_delivery_status")
dim_shipping = spark.sql("SELECT * FROM LTT_GoldLakehouse.dim_shipping_mode")
dim_transaction_type = spark.sql("SELECT * FROM LTT_GoldLakehouse.dim_transaction_type")

# Calendar and time-of-day dimensions.
dim_date = spark.sql("SELECT * FROM LTT_GoldLakehouse.dim_date")
dim_time = spark.sql("SELECT * FROM LTT_GoldLakehouse.dim_time")

# Orders are the only input read from the Silver lakehouse.
orders = spark.sql("SELECT * FROM LTT_SilverLakehouse.orders")

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 4, Finished, Available)

In [None]:
# Baseline row counts of every input, taken before any join so later counts can be compared.
# Each .count() is a separate Spark action.
print("Number of old records")
print(f"dim_order_status Count: {dim_order_status.count()}")
print(f"dim_delivery_risk Count: {dim_delivery_risk.count()}")
print(f"dim_shipping Count: {dim_shipping.count()}")
print(f"dim_transaction_type Count: {dim_transaction_type.count()}")
print(f"dim_delivery_status Count: {dim_delivery_status.count()}")

print(f"dim_customer Count: {dim_customer.count()}")
print(f"dim_store Count: {dim_store.count()}")
print(f"dim_destination Count: {dim_destination.count()}")
print(f"dim_product Count: {dim_product.count()}")
print(f"dim_category Count: {dim_category.count()}")
print(f"dim_department Count: {dim_department.count()}")

print(f"dim_date Count: {dim_date.count()}")
print(f"dim_time Count: {dim_time.count()}")

# The orders count is the reference value the later orders.count() cells are checked against.
print(f"orders Count: {orders.count()}")

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 5, Finished, Available)

Number of old records
dim_order_status Count: 9
dim_delivery_risk Count: 2
dim_shipping Count: 4
dim_transaction_type Count: 4
dim_delivery_status Count: 4
dim_customer Count: 18526
dim_store Count: 11999
dim_destination Count: 3771
dim_product Count: 118
dim_category Count: 51
dim_department Count: 11
dim_date Count: 3653
dim_time Count: 86400
orders Count: 178393


## **Merge Orders with Mapping Tables**

In [None]:
# Column inventory of `orders`, grouped by the kind of table each group is joined to below.
# None of these selects is assigned, so they do not change `orders`; only the last
# expression is rendered as the cell output.
orders.select('order_id')
# mapping tables
orders.select('order_status', 'late_delivery_risk', 'shipping_mode','transaction_type', 'delivery_status')
# location tables
orders.select('concat_destination_address', 'concat_customer_region')
# scd tables
orders.select('customer_id', 'product_id')
# other columns
orders.select('order_item_quantity', 'order_item_discount', 'order_date', 'shipping_date', 'sales', 'benefit_per_order')

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 6, Finished, Available)

DataFrame[order_item_quantity: int, order_item_discount: double, order_date: timestamp, shipping_date: timestamp, sales: double, benefit_per_order: double]

In [None]:
display(orders.select('shipping_mode', 'delivery_status', 'transaction_type', 'order_status', 'late_delivery_risk').head(5))

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 7, Finished, Available)

SynapseWidget(Synapse.DataFrame, 30b19e96-e064-4bb2-969f-8466c5cbcf12)

In [None]:
# Replace the order_status text with the matching dim_order_status row.
# No `how` is passed, so this is Spark's default inner join: orders whose status
# has no match in the dimension are dropped.
order_status_match = orders['order_status'] == dim_order_status['order_status']
orders = (
    orders
    .join(dim_order_status, order_status_match)
    # Both copies of the join column are removed; the dimension's other columns remain.
    .drop(orders['order_status'], dim_order_status['order_status'])
)

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 8, Finished, Available)

In [None]:
# Replace the shipping_mode text with the matching dim_shipping row.
# No `how` is passed, so this is Spark's default inner join: orders whose
# shipping mode has no match in the dimension are dropped.
shipping_mode_match = orders['shipping_mode'] == dim_shipping['shipping_mode']
orders = (
    orders
    .join(dim_shipping, shipping_mode_match)
    # Both copies of the join column are removed; the dimension's other columns remain.
    .drop(orders['shipping_mode'], dim_shipping['shipping_mode'])
)

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 9, Finished, Available)

In [None]:
# Replace the transaction_type text with the matching dim_transaction_type row.
# No `how` is passed, so this is Spark's default inner join: orders whose
# transaction type has no match in the dimension are dropped.
transaction_type_match = orders['transaction_type'] == dim_transaction_type['transaction_type']
orders = (
    orders
    .join(dim_transaction_type, transaction_type_match)
    # Both copies of the join column are removed; the dimension's other columns remain.
    .drop(orders['transaction_type'], dim_transaction_type['transaction_type'])
)

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 10, Finished, Available)

In [None]:
# Replace the delivery_status text with the matching dim_delivery_status row.
# No `how` is passed, so this is Spark's default inner join: orders whose
# delivery status has no match in the dimension are dropped.
delivery_status_match = orders['delivery_status'] == dim_delivery_status['delivery_status']
orders = (
    orders
    .join(dim_delivery_status, delivery_status_match)
    # Both copies of the join column are removed; the dimension's other columns remain.
    .drop(orders['delivery_status'], dim_delivery_status['delivery_status'])
)

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 11, Finished, Available)

In [None]:
# Translate the late_delivery_risk flag into the label used to join dim_delivery_risk:
# 1 -> "is late", 0 -> "not late", anything else (including NULL) -> "unknown".
# NOTE(review): dim_delivery_risk reported 2 rows above, so "unknown" presumably has no
# match and such orders would be dropped by the following inner join — verify.
risk_flag = orders["late_delivery_risk"]
risk_label = (
    when(risk_flag == 1, "is late")
    .when(risk_flag == 0, "not late")
    .otherwise("unknown")
)
# The raw flag is no longer needed once the label exists.
orders = orders.withColumn("delivery_risk_name", risk_label).drop('late_delivery_risk')

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 12, Finished, Available)

In [None]:
# Replace the derived delivery_risk_name label with the matching dim_delivery_risk row.
# No `how` is passed, so this is Spark's default inner join: orders whose label
# has no match in the dimension are dropped.
delivery_risk_match = orders['delivery_risk_name'] == dim_delivery_risk['delivery_risk_name']
orders = (
    orders
    .join(dim_delivery_risk, delivery_risk_match)
    # Both copies of the join column are removed; the dimension's other columns remain.
    .drop(orders['delivery_risk_name'], dim_delivery_risk['delivery_risk_name'])
)

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 13, Finished, Available)

In [None]:
display(orders.head(5))

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 14, Finished, Available)

SynapseWidget(Synapse.DataFrame, 423114c3-b992-476c-8511-b5b46f68bcbb)

## **Merge Orders with Customer Table**

In [None]:
# Attach the dim_customer row for each order via customer_id (default inner join).
# NOTE(review): the drop list below suggests dim_customer carries valid_from / valid_to /
# is_valid, and no is_valid filter is applied here — confirm each customer_id matches a
# single dimension row, otherwise orders would be duplicated.
customer_match = dim_customer['customer_id'] == orders['customer_id']
orders = orders.join(dim_customer, customer_match)

# Remove the natural key (dropping by name removes both copies) together with the
# descriptive and validity columns brought in by the dimension.
customer_cols_to_drop = [
    'customer_id',
    'customer_fname',
    'customer_lname',
    'customer_segment',
    'valid_from',
    'valid_to',
    'is_valid',
]
orders = orders.drop(*customer_cols_to_drop)

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 15, Finished, Available)

In [None]:
orders.count()

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 16, Finished, Available)

178393

## **Merge orders with location dimension tables**

In [None]:
# Rename the orders-side location join keys before joining. dim_destination and dim_store are
# joined below on columns named `concat_destination_address` / `concat_customer_region`, and
# keeping the same names on both sides makes those references ambiguous.
orders = orders.withColumnRenamed('concat_destination_address', 'destination_address')
orders = orders.withColumnRenamed('concat_customer_region', 'customer_region')

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 17, Finished, Available)

In [None]:
# Joining orders with dim_destination on the concatenated address key (default inner join)
orders = orders.join(dim_destination, dim_destination['concat_destination_address'] == orders['destination_address'])
# Dropping the join keys and the dimension's descriptive columns.
# longitude/latitude are dropped too: dim_store brings columns with the same names, and keeping
# both sets leaves `orders` with duplicate, ambiguous column names. Coordinates are looked up
# from the dimensions by key where they are needed.
orders = orders.drop('destination_address', 'desti_city', 'desti_state', 'desti_country', 'desti_region', 'desti_market', 'concat_destination_address', 'longitude', 'latitude')
# dim_destination = dim_destination.drop('concat_destination_address')

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 18, Finished, Available)

In [None]:
orders.count()

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 19, Finished, Available)

178393

In [None]:
# Joining orders with dim_store on the concatenated customer-region key (default inner join)
orders = orders.join(dim_store, dim_store['concat_customer_region'] == orders['customer_region'])

# Dropping the join keys and the dimension's descriptive columns from orders.
# longitude/latitude are dropped too (by name, which removes every column carrying that name):
# dim_destination brings columns with the same names, and keeping them leaves `orders` with
# duplicate, ambiguous column names. Coordinates are looked up from the dimensions by key
# where they are needed.
orders = orders.drop('customer_region', 'store_country', 'store_state', 'store_city', 'store_name', 'concat_customer_region', 'longitude', 'latitude')
# dim_store = dim_store.drop('concat_customer_region')

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 20, Finished, Available)

In [None]:
orders.count()

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 21, Finished, Available)

178393

## **Merge orders with hierarchical dimension tables**

In [None]:
# Attach the dim_product row for each order via product_id (default inner join).
# NOTE(review): the drop list below suggests dim_product carries valid_from / valid_to /
# is_valid, and no is_valid filter is applied here — confirm each product_id matches a
# single dimension row, otherwise orders would be duplicated.
product_match = dim_product['product_id'] == orders['product_id']
orders = orders.join(dim_product, product_match)

# Remove the natural key (dropping by name removes both copies) together with the
# descriptive and validity columns brought in by the dimension.
product_cols_to_drop = [
    'product_id',
    'product_category_id',
    'product_name',
    'product_price',
    'product_status',
    'valid_from',
    'valid_to',
    'is_valid',
]
orders = orders.drop(*product_cols_to_drop)

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 22, Finished, Available)

In [None]:
orders.count()

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 23, Finished, Available)

178393

In [None]:
# Joining dim_product with dim_category on product_category_id (default inner join).
# This rebinds dim_product only; `orders` was already joined to dim_product above and is not affected.
dim_product = dim_product.join(dim_category, dim_product['product_category_id'] == dim_category['product_category_id'])

# Dropping unnecessary columns from the joined dim_product DataFrame (not from orders)
dim_product = dim_product.drop('product_category_id', 'category_name', 'department_id', 'valid_from', 'valid_to', 'is_valid')

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 24, Finished, Available)

In [None]:
# Joining dim_category with dim_department on department_id (default inner join).
# This rebinds dim_category only; `orders` is not affected.
dim_category = dim_category.join(dim_department, dim_category['department_id'] == dim_department['department_id'])

# Dropping unnecessary columns from the joined dim_category DataFrame (not from orders)
dim_category = dim_category.drop('department_id', 'department_name', 'valid_from', 'valid_to', 'is_valid')

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 25, Finished, Available)

In [None]:
orders.printSchema()

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 26, Finished, Available)

root
 |-- order_id: integer (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- shipping_date: timestamp (nullable = true)
 |-- order_item_quantity: integer (nullable = true)
 |-- order_item_discount: double (nullable = true)
 |-- sales: double (nullable = true)
 |-- benefit_per_order: double (nullable = true)
 |-- order_status_key: integer (nullable = true)
 |-- shipping_mode_key: integer (nullable = true)
 |-- transaction_type_key: integer (nullable = true)
 |-- delivery_status_key: integer (nullable = true)
 |-- abbreviated_status: string (nullable = true)
 |-- delivery_risk_key: integer (nullable = true)
 |-- customer_key: integer (nullable = true)
 |-- desti_key: integer (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- store_key: integer (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- product_key: integer (nullable = true)



In [None]:
orders.count()

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 27, Finished, Available)

178393

In [None]:
# overwrite the orders after mapping to other tables
# to compare when incremental load
# orders.write.format('delta').mode('overwrite').saveAsTable('LTT_SilverLakehouse.orders')

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 28, Finished, Available)

## **Merge orders with timestamp tables**

In [None]:
# Derive shipping_time_key and shipping_date_key from the shipping_date timestamp.
# The time pattern must use 'HH' (hour-of-day, 00-23). 'hh' is the 12-hour clock hour, which
# maps e.g. 14:05:09 and 02:05:09 to the same key and can never produce a key for hour 00.
orders = orders.withColumn('shipping_time_key', date_format('shipping_date', 'HHmmss'))
# Overwrite the timestamp with its yyyyMMdd string form, then rename it to the key name.
orders = orders.withColumn('shipping_date', date_format('shipping_date', 'yyyyMMdd'))
orders = orders.withColumnRenamed('shipping_date', 'shipping_date_key')

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 29, Finished, Available)

In [None]:
# Derive order_time_key and order_date_key from the order_date timestamp.
# The time pattern must use 'HH' (hour-of-day, 00-23). 'hh' is the 12-hour clock hour, which
# maps e.g. 14:05:09 and 02:05:09 to the same key and can never produce a key for hour 00.
orders = orders.withColumn('order_time_key', date_format('order_date', 'HHmmss'))
# Overwrite the timestamp with its yyyyMMdd string form, then rename it to the key name.
orders = orders.withColumn('order_date', date_format('order_date', 'yyyyMMdd'))
orders = orders.withColumnRenamed('order_date', 'order_date_key')

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 30, Finished, Available)

In [None]:
orders.printSchema()

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 31, Finished, Available)

root
 |-- order_id: integer (nullable = true)
 |-- order_date_key: string (nullable = true)
 |-- shipping_date_key: string (nullable = true)
 |-- order_item_quantity: integer (nullable = true)
 |-- order_item_discount: double (nullable = true)
 |-- sales: double (nullable = true)
 |-- benefit_per_order: double (nullable = true)
 |-- order_status_key: integer (nullable = true)
 |-- shipping_mode_key: integer (nullable = true)
 |-- transaction_type_key: integer (nullable = true)
 |-- delivery_status_key: integer (nullable = true)
 |-- abbreviated_status: string (nullable = true)
 |-- delivery_risk_key: integer (nullable = true)
 |-- customer_key: integer (nullable = true)
 |-- desti_key: integer (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- store_key: integer (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- product_key: integer (nullable = true)
 |-- shipping_time_key: string 

In [None]:
# Parse the yyyyMMdd key strings back into DATE columns named order_date / shipping_date.
# The *_date_key string columns are kept alongside them.
orders = (
    orders
    .withColumn("order_date", F.to_date(F.col("order_date_key"), "yyyyMMdd"))
    .withColumn("shipping_date", F.to_date(F.col("shipping_date_key"), "yyyyMMdd"))
)

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 32, Finished, Available)

In [None]:
orders.printSchema()

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 33, Finished, Available)

root
 |-- order_id: integer (nullable = true)
 |-- order_date_key: string (nullable = true)
 |-- shipping_date_key: string (nullable = true)
 |-- order_item_quantity: integer (nullable = true)
 |-- order_item_discount: double (nullable = true)
 |-- sales: double (nullable = true)
 |-- benefit_per_order: double (nullable = true)
 |-- order_status_key: integer (nullable = true)
 |-- shipping_mode_key: integer (nullable = true)
 |-- transaction_type_key: integer (nullable = true)
 |-- delivery_status_key: integer (nullable = true)
 |-- abbreviated_status: string (nullable = true)
 |-- delivery_risk_key: integer (nullable = true)
 |-- customer_key: integer (nullable = true)
 |-- desti_key: integer (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- store_key: integer (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- product_key: integer (nullable = true)
 |-- shipping_time_key: string 

In [None]:
# Expose the time keys under the plain names used by the fact selections below.
orders = (
    orders
    .withColumnRenamed('order_time_key', 'order_time')
    .withColumnRenamed('shipping_time_key', 'shipping_time')
)

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 34, Finished, Available)

## **Create a base fact which is relevant to orders**

In [None]:
# Columns of the sales fact: identifier, order date/time, dimension keys, then measures.
fact_sales_columns = [
    'order_id',
    'order_date', 'order_time',
    'product_key', 'customer_key',
    'store_key', 'desti_key',
    'transaction_type_key', 'order_status_key',
    'order_item_discount', 'order_item_quantity',
    'sales', 'benefit_per_order',
]
# Project orders down to those columns and sort by order_id.
fact_sales = orders.select(*fact_sales_columns).orderBy('order_id')

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 35, Finished, Available)

In [None]:
display(fact_sales.head(5))

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 36, Finished, Available)

SynapseWidget(Synapse.DataFrame, ec851735-0408-48d8-83fc-1f2305bda7a1)

In [None]:
fact_sales.count()

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 37, Finished, Available)

178393

## **Create a fact related to delivery process**

In [None]:
import math

def haversine_distance(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points using the haversine formula.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.

    Returns:
        The distance in kilometres (sphere of radius 6371.0 km), or None when
        any of the four coordinates is None.
    """
    # A missing coordinate means the distance is unknown; propagate None.
    if None in (lat1, lon1, lat2, lon2):
        return None

    # Radius of the Earth in kilometers
    R = 6371.0

    # Convert latitude and longitude from degrees to radians
    lat1_rad = math.radians(lat1)
    lon1_rad = math.radians(lon1)
    lat2_rad = math.radians(lat2)
    lon2_rad = math.radians(lon2)

    # Differences in coordinates
    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad

    # Haversine formula
    a = math.sin(dlat / 2)**2 + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(dlon / 2)**2
    # Mathematically a lies in [0, 1], but floating-point rounding can push it marginally
    # outside that range for (near-)antipodal points, which would make sqrt(1 - a) raise
    # "math domain error". Clamp before taking the square roots.
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    # Distance in kilometers
    distance = R * c
    return distance

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 38, Finished, Available)

In [None]:
# Register the DataFrames under SQL-visible names for the spark.sql queries that follow.
# Each view is named after its Python variable; an existing view of the same name is replaced.
_views_to_register = (
    ("fact_sales", fact_sales),
    ("orders", orders),
    ("dim_date", dim_date),
    ("dim_product", dim_product),
    ("dim_destination", dim_destination),
    ("dim_store", dim_store),
    ("dim_time", dim_time),
)
for _view_name, _frame in _views_to_register:
    _frame.createOrReplaceTempView(_view_name)


StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 39, Finished, Available)

In [None]:
# d1.date_key as shipping_date,
        # d3.time_key as order_time,
        # d2.date_key as order_date,
        # d4.time_key as shipping_time,
        # datediff(d1.date, d2.date) as delivery_days

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 40, Finished, Available)

In [None]:
# Build the delivery-performance rows from the `orders` temp view. The alias `fs` in the
# query refers to `orders`; the `fact_sales` view is not read here.
# delivery_days = datediff(shipping date, order date), taken from the dim_date rows matched
# on each date. All joins are LEFT joins, so every orders row is kept and a date with no
# dim_date match yields a NULL delivery_days.
# NOTE(review): the dim_time joins (d3, d4) contribute no selected column — they only matter
# if time_key is not unique, in which case they would multiply rows; confirm they are needed.
# NOTE(review): order_time / shipping_time are strings produced by date_format — confirm they
# are comparable with dim_time.time_key.
delivery_performance = spark.sql("""
    select
        fs.order_id,
        fs.order_date,
        fs.order_time,
        fs.shipping_date,
        fs.shipping_time,
        datediff(d1.date, d2.date) as delivery_days,
        fs.store_key,
        fs.desti_key,
        fs.delivery_risk_key,
        fs.delivery_status_key,
        fs.shipping_mode_key
    from orders fs
    left join dim_date d1
        on fs.shipping_date = d1.date
    left join dim_date d2
        on fs.order_date = d2.date
    left join dim_time d3
        on fs.order_time = d3.time_key
    left join dim_time d4
        on fs.shipping_time = d4.time_key
""")

# Expose the result to SQL so the next query can extend it with coordinates.
delivery_performance.createOrReplaceTempView('delivery_performance')

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 41, Finished, Available)

In [None]:
# Look up the coordinates of both ends of each delivery by key:
#   lat1/lon1 come from dim_store (via store_key), lat2/lon2 from dim_destination (via desti_key).
# LEFT joins keep every row; a key with no match leaves its coordinates NULL.
# The Python name is rebound to the new result; the 'delivery_performance' temp view still
# refers to the previous DataFrame.
delivery_performance = spark.sql("""
    select dp.order_id, dp.order_date, dp.order_time, dp.shipping_date, dp.shipping_time, dp.delivery_days, dp.store_key, dp.desti_key, dp.delivery_risk_key, dp.delivery_status_key, dp.shipping_mode_key, dd1.latitude AS lat1, dd1.longitude AS lon1, dd2.latitude AS lat2, dd2.longitude AS lon2
    from delivery_performance dp
    left join dim_store dd1
    on dp.store_key = dd1.store_key
    left join dim_destination dd2
    on dp.desti_key = dd2.desti_key
""")
# Row-count check: the lookups should not add or remove rows.
print(delivery_performance.count())
display(delivery_performance.head(5))

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 42, Finished, Available)

178393


SynapseWidget(Synapse.DataFrame, 07d2889e-7eb7-49b6-b38f-4c28e43c848a)

In [None]:
# Wrap the pure-Python haversine function as a Spark UDF returning a double.
haversine_udf = udf(haversine_distance, DoubleType())

# Store -> destination distance; argument order is (lat1, lon1, lat2, lon2).
coordinate_columns = [delivery_performance[name] for name in ('lat1', 'lon1', 'lat2', 'lon2')]
delivery_performance = delivery_performance.withColumn('distance_km', haversine_udf(*coordinate_columns))

display(delivery_performance.head(5))

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 43, Finished, Available)

SynapseWidget(Synapse.DataFrame, 62686303-cf53-4964-984b-0e441784878d)

In [None]:
# Report how many rows lack delivery_days and distance_km before filling them.
print(delivery_performance.filter(delivery_performance['delivery_days'].isNull()).count())
print(delivery_performance.filter(delivery_performance['distance_km'].isNull()).count())

# Replace missing values with 0.
# NOTE(review): a 0-filled distance is indistinguishable from a real 0 km delivery and is
# later bucketed as '0-500 km' — confirm this is the intended treatment of unknown distances.
delivery_performance = (
    delivery_performance
    .withColumn('delivery_days', coalesce('delivery_days', lit(0)))
    .withColumn('distance_km', coalesce('distance_km', lit(0)))
)

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 44, Finished, Available)

0
361


In [None]:
display(delivery_performance.head(5))

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 45, Finished, Available)

SynapseWidget(Synapse.DataFrame, 1659ef30-8329-4eb1-ae94-189dd7816771)

In [None]:
# display(delivery_performance.withColumn('distance_km', coalesce('distance_km', lit(0))).filter(col('desti_key') == 3574).head(5))

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 46, Finished, Available)

In [None]:
display(dim_destination.filter(col('desti_key')==3574))

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 47, Finished, Available)

SynapseWidget(Synapse.DataFrame, ea201cdb-640f-4e12-baf4-434c48212eb0)

In [None]:
def categorize_distance(distance):
    """Bucket a distance in kilometres into a labelled range.

    Args:
        distance: Distance in km, or None when unknown.

    Returns:
        '0-500 km', '500-2000 km', '2000-5000 km' or ' >5000 km' (each upper
        bound is inclusive), or None when distance is None.
    """
    # Spark hands SQL NULL to a Python UDF as None; comparing None with a number raises
    # TypeError and fails the whole job, so an unknown distance stays uncategorized.
    if distance is None:
        return None
    if distance <= 500:
        return '0-500 km'
    if distance <= 2000:
        return '500-2000 km'
    if distance <= 5000:
        return '2000-5000 km'
    # The leading space in this label is kept unchanged so existing values stay the same.
    return ' >5000 km'

# Spark UDF wrapper around categorize_distance; the bucket label is returned as a string column.
categorize_distance_udf = udf(categorize_distance, StringType())

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 48, Finished, Available)

In [None]:
# apply udfs
delivery_performance_with_categories = delivery_performance.withColumn("distance_category", categorize_distance_udf(col("distance_km")))

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 49, Finished, Available)

In [None]:
delivery_performance_with_categories.printSchema()

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 50, Finished, Available)

root
 |-- order_id: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- order_time: string (nullable = true)
 |-- shipping_date: date (nullable = true)
 |-- shipping_time: string (nullable = true)
 |-- delivery_days: integer (nullable = false)
 |-- store_key: integer (nullable = true)
 |-- desti_key: integer (nullable = true)
 |-- delivery_risk_key: integer (nullable = true)
 |-- delivery_status_key: integer (nullable = true)
 |-- shipping_mode_key: integer (nullable = true)
 |-- lat1: double (nullable = true)
 |-- lon1: double (nullable = true)
 |-- lat2: double (nullable = true)
 |-- lon2: double (nullable = true)
 |-- distance_km: double (nullable = false)
 |-- distance_category: string (nullable = true)



In [None]:
# Final column set of the delivery fact. The intermediate lat/lon columns are left out.
delivery_fact_columns = [
    'order_id',
    'order_date', 'order_time',
    'shipping_date', 'shipping_time',
    'delivery_days',
    'store_key', 'desti_key',
    'distance_km', 'distance_category',
    'delivery_risk_key', 'delivery_status_key', 'shipping_mode_key',
]
fact_delivery_performance = delivery_performance_with_categories.select(*delivery_fact_columns)
# Preview the first rows of the result
display(fact_delivery_performance.head(10))

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 51, Finished, Available)

SynapseWidget(Synapse.DataFrame, 52dab7e0-8dab-4019-823e-a6624ee9444b)

## **Fact sales and fact delivery performance**

In [None]:
fact_delivery_performance = fact_delivery_performance.orderBy('order_id')

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 53, Finished, Available)

In [None]:
# fact_sales = fact_sales.withColumnRenamed('order_id', 'order_id_1')

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 54, Finished, Available)

In [None]:
fact_delivery_performance.printSchema()

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 63, Finished, Available)

root
 |-- order_id: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- order_time: string (nullable = true)
 |-- shipping_date: date (nullable = true)
 |-- shipping_time: string (nullable = true)
 |-- delivery_days: integer (nullable = false)
 |-- store_key: integer (nullable = true)
 |-- desti_key: integer (nullable = true)
 |-- distance_km: double (nullable = false)
 |-- distance_category: string (nullable = true)
 |-- delivery_risk_key: integer (nullable = true)
 |-- delivery_status_key: integer (nullable = true)
 |-- shipping_mode_key: integer (nullable = true)



In [None]:
fact_sales.printSchema()

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 64, Finished, Available)

root
 |-- order_id: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- order_time: string (nullable = true)
 |-- product_key: integer (nullable = true)
 |-- customer_key: integer (nullable = true)
 |-- store_key: integer (nullable = true)
 |-- desti_key: integer (nullable = true)
 |-- transaction_type_key: integer (nullable = true)
 |-- order_status_key: integer (nullable = true)
 |-- order_item_discount: double (nullable = true)
 |-- order_item_quantity: integer (nullable = true)
 |-- sales: double (nullable = true)
 |-- benefit_per_order: double (nullable = true)



In [None]:
# Pair each sales row with its delivery row on every key the two facts share.
# store_key must be compared with `==` like the other keys: both facts are derived from the
# same orders rows, so `!=` can never match a row with its own counterpart.
# NOTE(review): the result carries both sides' copies of the shared columns; if order_id is
# not unique per row, this join multiplies rows — verify the row count.
fact_aggregation = fact_sales.join(
    fact_delivery_performance,
    (fact_sales['order_id'] == fact_delivery_performance['order_id']) &
    (fact_sales['store_key'] == fact_delivery_performance['store_key']) &
    (fact_sales['desti_key'] == fact_delivery_performance['desti_key']) &
    (fact_sales['order_date'] == fact_delivery_performance['order_date']) &
    (fact_sales['order_time'] == fact_delivery_performance['order_time'])
)

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 73, Finished, Available)

In [None]:
fact_aggregation.printSchema()

StatementMeta(, , , Waiting, )

root
 |-- order_id: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- order_time: string (nullable = true)
 |-- product_key: integer (nullable = true)
 |-- customer_key: integer (nullable = true)
 |-- store_key: integer (nullable = true)
 |-- desti_key: integer (nullable = true)
 |-- transaction_type_key: integer (nullable = true)
 |-- order_status_key: integer (nullable = true)
 |-- order_item_discount: double (nullable = true)
 |-- order_item_quantity: integer (nullable = true)
 |-- sales: double (nullable = true)
 |-- benefit_per_order: double (nullable = true)
 |-- order_id: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- order_time: string (nullable = true)
 |-- shipping_date: date (nullable = true)
 |-- shipping_time: string (nullable = true)
 |-- delivery_days: integer (nullable = true)
 |-- store_key: integer (nullable = true)
 |-- desti_key: integer (nullable = true)
 |-- distance_km: double (nullable = true)
 |-- distance_category: 

In [None]:
display(fact_aggregation)

StatementMeta(, 67dcf650-86ac-4780-b24f-1e97ef2f3628, 74, Finished, Available)

SynapseWidget(Synapse.DataFrame, a6382253-f67f-41ef-a66d-6dc147139c40)

## **Load into GoldLakehouse**

In [None]:
# Persist both facts as Delta tables in the Gold lakehouse.
# mode('overwrite') replaces the existing table contents, and overwriteSchema lets the table
# schema be replaced when it differs from the DataFrame's.
(
    fact_sales.write
    .format('delta')
    .mode('overwrite')
    .option('overwriteSchema', "true")
    .saveAsTable('LTT_GoldLakehouse.fact_sales')
)
(
    fact_delivery_performance.write
    .format('delta')
    .mode('overwrite')
    .option('overwriteSchema', "true")
    .saveAsTable('LTT_GoldLakehouse.fact_delivery_performance')
)

StatementMeta(, , , Cancelled, )