In [1]:
from pyspark.sql.functions import sum, col, concat
from pyspark.sql.functions import to_timestamp, date_format
from pyspark.sql.functions import desc, last_day, rand
import pandas as pd
from pyspark.sql.functions import col, datediff, coalesce, lit
import math
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
from pyspark.sql.types import StringType, IntegerType, FloatType

StatementMeta(, f5b0380d-5599-4ac9-8408-ed5a776d6933, 3, Finished, Available)

## **Import Dim and Fact Tables**

In [2]:
# Load the Gold-layer tables from LTT_GoldLakehouse as Spark DataFrames.
# NOTE(review): dim_customer, dim_category, dim_department, dim_delivery_risk,
# dim_shipping, dim_transaction and dim_order_status are not referenced again in
# the cells visible here - confirm they are still needed.
dim_customer = spark.sql("SELECT * FROM LTT_GoldLakehouse.dim_customer")
dim_product = spark.sql("SELECT * FROM LTT_GoldLakehouse.dim_product")
dim_category = spark.sql("SELECT * FROM LTT_GoldLakehouse.dim_category")
dim_department = spark.sql("SELECT * FROM LTT_GoldLakehouse.dim_department")

# Delivery / shipping / order status look-ups.
dim_delivery_status = spark.sql("SELECT * FROM LTT_GoldLakehouse.dim_delivery_status")
dim_delivery_risk = spark.sql("SELECT * FROM LTT_GoldLakehouse.dim_delivery_risk")
dim_shipping = spark.sql("SELECT * FROM LTT_GoldLakehouse.dim_shipping_mode")
dim_transaction = spark.sql("SELECT * FROM LTT_GoldLakehouse.dim_transaction_type")
dim_order_status = spark.sql("SELECT * FROM LTT_GoldLakehouse.dim_order_status")

# Calendar and time-of-day dimensions (joined to fact_sales in the delivery query).
dim_date = spark.sql("SELECT * FROM LTT_GoldLakehouse.dim_date")
dim_time = spark.sql("SELECT * FROM LTT_GoldLakehouse.dim_time")

# Location dimensions; their latitude/longitude columns feed the distance calculation.
dim_store = spark.sql("SELECT * FROM LTT_GoldLakehouse.dim_store")
dim_destination = spark.sql("SELECT * FROM LTT_GoldLakehouse.dim_destination")

# Sales fact table that the delivery metrics are derived from.
fact_sales = spark.sql("SELECT * FROM LTT_GoldLakehouse.fact_sales")

StatementMeta(, f5b0380d-5599-4ac9-8408-ed5a776d6933, 4, Finished, Available)

In [3]:
# Show the Delta transaction history of the customer dimension.
# The table name is qualified with the lakehouse, like every table load in this
# notebook, so the check does not depend on which lakehouse is the session default.
display(spark.sql('describe history LTT_GoldLakehouse.dim_customer'))
# display(spark.sql('refresh table dim_customer'))


StatementMeta(, f5b0380d-5599-4ac9-8408-ed5a776d6933, 5, Finished, Available)

SynapseWidget(Synapse.DataFrame, 3e73fbbd-1dcb-4366-8858-33c85a15593b)

## **Delivery Distance**

In [4]:
# Register the two location dimensions as temp views so Spark SQL can reference them by name.
dim_destination.createOrReplaceTempView('dim_destination')
dim_store.createOrReplaceTempView('dim_store')
# Print the row count of each dimension (destination first, then store).
print(dim_destination.count())
print(dim_store.count())
# Preview the first five rows of each dimension.
display(dim_store.head(5))
display(dim_destination.head(5))

StatementMeta(, f5b0380d-5599-4ac9-8408-ed5a776d6933, 6, Finished, Available)

312
1646


SynapseWidget(Synapse.DataFrame, 4d1be872-d056-4f94-a02d-eb89fba9fc5a)

SynapseWidget(Synapse.DataFrame, a77bede9-4839-448e-aa4c-878fd9ec5587)

In [5]:
import math

def haversine_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.

    Returns:
        The haversine distance in kilometres as a float, or ``None`` when any
        of the four coordinates is ``None`` (so a Spark UDF wrapping this
        function yields NULL for rows with missing coordinates).
    """
    if any(coord is None for coord in (lat1, lon1, lat2, lon2)):
        return None

    # Radius of the Earth in kilometers
    R = 6371.0

    # Convert latitude and longitude from degrees to radians
    lat1_rad = math.radians(lat1)
    lon1_rad = math.radians(lon1)
    lat2_rad = math.radians(lat2)
    lon2_rad = math.radians(lon2)

    # Differences in coordinates
    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad

    # Haversine formula
    a = math.sin(dlat / 2)**2 + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(dlon / 2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) raise ValueError.
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    # Distance in kilometers
    return R * c



StatementMeta(, f5b0380d-5599-4ac9-8408-ed5a776d6933, 7, Finished, Available)

In [6]:
# Register every frame needed by the SQL cells below under its own name.
# Registration order matches the original one-call-per-line version.
frames_by_view_name = {
    "fact_sales": fact_sales,
    "dim_date": dim_date,
    "dim_product": dim_product,
    "dim_destination": dim_destination,
    "dim_store": dim_store,
    "dim_time": dim_time,
}
for view_name, frame in frames_by_view_name.items():
    frame.createOrReplaceTempView(view_name)

# fact_sales.show(2)
# dim_date.show(2)

StatementMeta(, f5b0380d-5599-4ac9-8408-ed5a776d6933, 8, Finished, Available)

In [7]:
# Perform the query operation
# Step 1: resolve the order / shipping date and time keys against the date and
# time dimensions and derive delivery_days as the day difference between the
# shipping date and the order date.
delivery_performance = spark.sql("""
    select
        fs.order_id,
        fs.product_key,
        fs.store_key,
        fs.desti_key,
        fs.delivery_status_key,
        fs.shipping_mode_key,
        d1.date_key as shipping_date,
        d3.time_key as order_time,
        d2.date_key as order_date,
        d4.time_key as shipping_time,
        datediff(d1.date, d2.date) as delivery_days
    from fact_sales fs
    left join dim_date d1
        on fs.shipping_date_key = d1.date_key
    left join dim_date d2
        on fs.order_date_key = d2.date_key
    left join dim_time d3
        on fs.order_time_key = d3.time_key
    left join dim_time d4
        on fs.shipping_time_key = d4.time_key
""")

delivery_performance.createOrReplaceTempView('delivery_performance')

# Step 2: attach store (lat1/lon1) and destination (lat2/lon2) coordinates.
# store_key and desti_key are taken from the fact side (dp), not from the joined
# dimensions: with a LEFT JOIN the dimension-side key is NULL whenever there is
# no match, which would silently drop the key for exactly those rows.
delivery_performance = spark.sql("""
    select dp.order_id, dp.product_key, dp.delivery_status_key, dp.order_date, dp.order_time, dp.shipping_date, dp.shipping_time, dp.shipping_mode_key, dp.delivery_days, dp.store_key, dp.desti_key, dd1.latitude AS lat1, dd1.longitude AS lon1, dd2.latitude AS lat2, dd2.longitude AS lon2
    from delivery_performance dp
    left join dim_store dd1
    on dp.store_key = dd1.store_key
    left join dim_destination dd2
    on dp.desti_key = dd2.desti_key
""")

StatementMeta(, f5b0380d-5599-4ac9-8408-ed5a776d6933, 9, Finished, Available)

In [8]:
# Preview the first five joined rows, including the lat/lon columns used for the distance.
display(delivery_performance.head(5))

StatementMeta(, f5b0380d-5599-4ac9-8408-ed5a776d6933, 10, Finished, Available)

SynapseWidget(Synapse.DataFrame, bb24cf38-2fdb-4552-9e83-5a37509cbb3c)

In [9]:
# Wrap the Python haversine helper as a Spark UDF that returns a double.
haversine_udf = udf(haversine_distance, DoubleType())

# Store-to-destination distance in kilometres, computed row by row from the
# coordinate columns added by the previous join.
distance_expr = haversine_udf(col('lat1'), col('lon1'), col('lat2'), col('lon2'))
delivery_performance = delivery_performance.withColumn('distance_km', distance_expr)

display(delivery_performance.head(5))

StatementMeta(, f5b0380d-5599-4ac9-8408-ed5a776d6933, 11, Finished, Available)

SynapseWidget(Synapse.DataFrame, 4c7c57b1-5a0a-4464-b395-9f709b4f9d94)

In [10]:
# Keep the keys plus the two measures; the raw coordinate columns are dropped here.
output_columns = [
    'order_id',
    'product_key',
    'delivery_status_key',
    'order_date',
    'order_time',
    'shipping_date',
    'shipping_time',
    'shipping_mode_key',
    'store_key',
    'desti_key',
    'delivery_days',
    'distance_km',
]
delivery_performance = delivery_performance.select(*output_columns)

# Preview the resulting DataFrame
display(delivery_performance.head(10))

StatementMeta(, f5b0380d-5599-4ac9-8408-ed5a776d6933, 12, Finished, Available)

SynapseWidget(Synapse.DataFrame, 1ee79a6b-f931-4c43-9d34-8a6d0fb1f161)

In [11]:
# Report how many rows are missing each measure before the gaps are filled
# (delivery_days first, then distance_km).
for measure in ('delivery_days', 'distance_km'):
    print(delivery_performance.filter(col(measure).isNull()).count())

# Replace missing measures with 0.
# NOTE(review): after this step an unknown distance is indistinguishable from a
# genuine 0 km distance and will land in the shortest distance bucket - confirm
# this is the intended treatment.
delivery_performance = (
    delivery_performance
    .withColumn('delivery_days', coalesce('delivery_days', lit(0)))
    .withColumn('distance_km', coalesce('distance_km', lit(0)))
)

StatementMeta(, f5b0380d-5599-4ac9-8408-ed5a776d6933, 13, Finished, Available)

0
2797


> Distance Performance

In [12]:
def categorize_distance(distance):
    """Map a distance in kilometres to a distance-band label.

    Args:
        distance: Distance in kilometres, or ``None`` when it is unknown.

    Returns:
        One of ``'0-500 km'``, ``'500-2000 km'``, ``'2000-5000 km'`` or
        ``' >5000 km'`` (upper bounds are inclusive), or ``None`` when
        ``distance`` is ``None``.
    """
    # Comparing None with a number raises TypeError; return None instead so a
    # UDF built on this function yields NULL rather than failing the job.
    if distance is None:
        return None
    if distance <= 500:
        return '0-500 km'
    if distance <= 2000:
        return '500-2000 km'
    if distance <= 5000:
        return '2000-5000 km'
    # NOTE(review): the leading space in this label is kept as-is because it is
    # written to the output table; confirm whether it is intentional.
    return ' >5000 km'

# Disabled draft: classifies a delivery as 'On Time' or 'Late' from a
# per-distance-band limit on delivery_days. Kept for reference only; it is not
# registered or applied anywhere in the active code.
# def evaluate_performance(distance, delivery_days):
#     if distance <= 500 and delivery_days <= 1:
#         return 'On Time'
#     elif distance <= 2000 and delivery_days <= 3:
#         return 'On Time'
#     elif distance <= 5000 and delivery_days <= 5:
#         return 'On Time'
#     elif distance > 5000 and delivery_days <= 7:
#         return 'On Time'
#     else:
#         return 'Late'

# Spark UDF wrapper around categorize_distance; returns the band label as a string.
categorize_distance_udf = udf(categorize_distance, StringType())
# evaluate_performance_udf = udf(evaluate_performance, StringType())


StatementMeta(, f5b0380d-5599-4ac9-8408-ed5a776d6933, 14, Finished, Available)

In [13]:
# Apply the UDF: add a distance_category column holding the band label derived from distance_km.
delivery_performance_with_categories = delivery_performance.withColumn("distance_category", categorize_distance_udf(col("distance_km")))
# Disabled: would add an On Time / Late flag using the commented-out evaluate_performance UDF.
# delivery_performance_with_categories = delivery_performance.withColumn("delivery_performance", evaluate_performance_udf(col("distance_km"), col("delivery_days")))


StatementMeta(, f5b0380d-5599-4ac9-8408-ed5a776d6933, 15, Finished, Available)

In [14]:
# Expose the categorised frame to Spark SQL under the same name as the Python variable.
delivery_performance_with_categories.createOrReplaceTempView('delivery_performance_with_categories')

StatementMeta(, f5b0380d-5599-4ac9-8408-ed5a776d6933, 16, Finished, Available)

In [15]:

# printSchema() prints the schema itself and returns None, so it is called
# directly instead of being passed to display().
delivery_performance_with_categories.printSchema()

StatementMeta(, f5b0380d-5599-4ac9-8408-ed5a776d6933, 17, Finished, Available)

root
 |-- order_id: integer (nullable = true)
 |-- product_key: integer (nullable = true)
 |-- delivery_status_key: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_time: string (nullable = true)
 |-- shipping_date: string (nullable = true)
 |-- shipping_time: string (nullable = true)
 |-- shipping_mode_key: integer (nullable = true)
 |-- store_key: integer (nullable = true)
 |-- desti_key: integer (nullable = true)
 |-- delivery_days: integer (nullable = false)
 |-- distance_km: double (nullable = false)
 |-- distance_category: string (nullable = true)



In [16]:
# Sanity check: list any rows whose delivery_days is still NULL.
rows_missing_delivery_days = delivery_performance_with_categories.where(
    col('delivery_days').isNull()
)
display(rows_missing_delivery_days)

StatementMeta(, f5b0380d-5599-4ac9-8408-ed5a776d6933, 18, Finished, Available)

SynapseWidget(Synapse.DataFrame, 64a2bce7-3c17-4b53-82fa-8d016ab28071)

In [17]:
# (Re-)register the categorised frame so this cell can be run on its own.
delivery_performance_with_categories.createOrReplaceTempView('delivery_performance_with_categories')

# Aggregate deliveries per destination, shipping date/time, distance band and
# delivery_days, counting deliveries by status and by delivery-time threshold.
# dim_delivery_status is referenced by its lakehouse-qualified name: it is never
# registered as a temp view in this notebook, so an unqualified name would only
# resolve when LTT_GoldLakehouse happens to be the session's default lakehouse.
# NOTE(review): delivery_days is itself a grouping column, so within each group
# avg_delivery_days equals delivery_days and each "within N days" counter is
# either 0 or total_deliveries - confirm this grain is intended.
agg_delivery_performance = spark.sql("""
    select desti_key,
        shipping_date,
        shipping_time,
        distance_category,
        delivery_days,
        count(dp.order_id) as total_deliveries,
        sum(case when dds.delivery_status = 'Advance shipping' then 1 else 0 end) as advanced_deliveries,
        sum(case when dds.delivery_status = 'Late delivery' then 1 else 0 end) as lated_deliveries,
        sum(case when dds.delivery_status = 'Shipping canceled' then 1 else 0 end) as canceled_deliveries,
        sum(case when dds.delivery_status = 'Shipping on time' then 1 else 0 end) as on_time_deliveries,
        avg(delivery_days) as avg_delivery_days,
        sum(case when delivery_days <= 1 then 1 else 0 end) as deliveries_within_1_day,
        sum(case when delivery_days <= 3 then 1 else 0 end) as deliveries_within_3_days,
        sum(case when delivery_days <= 7 then 1 else 0 end) as deliveries_within_7_days,
        sum(case when delivery_days > 7 then 1 else 0 end) as deliveries_more_than_7_days
    from delivery_performance_with_categories as dp
    left join LTT_GoldLakehouse.dim_delivery_status as dds
    on dp.delivery_status_key = dds.delivery_status_key
    group by desti_key, shipping_date, shipping_time, distance_category, delivery_days
""")


display(agg_delivery_performance.head(10))

StatementMeta(, f5b0380d-5599-4ac9-8408-ed5a776d6933, 19, Finished, Available)

SynapseWidget(Synapse.DataFrame, 06de79d8-285b-460b-a41b-1069c2b509c5)

In [18]:
# display(agg_delivery_performance)
# Only the first month of orders (by order_date) was taken, so the shipping dates run into month 2.

StatementMeta(, f5b0380d-5599-4ac9-8408-ed5a776d6933, 20, Finished, Available)

## **Load into GoldLakehouse**

In [19]:
# Persist both results as Delta tables in the Gold lakehouse. Each write fully
# replaces the existing table, including its schema. The order of the writes is
# the same as before: transaction-level table first, then the aggregate.
gold_outputs = (
    (delivery_performance_with_categories, 'LTT_GoldLakehouse.fact_tran_distance'),
    (agg_delivery_performance, 'LTT_GoldLakehouse.fact_agg_delivery_performance'),
)
for output_frame, target_table in gold_outputs:
    (
        output_frame.write
        .format('delta')
        .mode('overwrite')
        .option('overwriteSchema', 'true')
        .saveAsTable(target_table)
    )

StatementMeta(, f5b0380d-5599-4ac9-8408-ed5a776d6933, 21, Finished, Available)