In [25]:
from pyspark.sql import functions as F
from service.utils.spark import get_spark_session
# Obtain the Spark session from the project helper. What dev=True configures
# is defined in service.utils.spark and is not visible from this notebook.
spark = get_spark_session(dev=True)


25/09/23 15:55:23 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [26]:
# Namespace that the tables read and written in this notebook live under.
test_namespace = "gold"

# IF NOT EXISTS keeps this cell safe to re-run.
spark.sql(
    f"CREATE NAMESPACE IF NOT EXISTS {test_namespace}"
)

DataFrame[]

In [27]:
# Load the order milestone timestamps and preview the first rows untruncated.
delivered_order_timestamp = spark.read.table(
    f"{test_namespace}.delivered_order_timestamp"
)
delivered_order_timestamp.show(5, truncate=False)

+--------------------------------+-------------------+--------------------------------+-------------------+-------------------+-------------------+-------------------+-----------------------+
|order_id                        |shipping_limit_date|customer_id                     |purchase           |approved           |delivered_carrier  |delivered_customer |estimated_delivery_date|
+--------------------------------+-------------------+--------------------------------+-------------------+-------------------+-------------------+-------------------+-----------------------+
|110f190bf49ab04f3ead4f860570e902|2017-10-09 15:56:20|cc2bef54fa72102dca9777ff483f75a5|2017-10-03 15:46:55|2017-10-03 15:56:20|2017-10-04 16:24:02|2017-10-06 21:48:05|2017-10-20 00:00:00    |
|14129c568e62c007ad8d334de62aca31|2017-11-21 02:46:35|dd246e640bf9e0d31ba62cf5cf2bd1bd|2017-11-14 01:49:03|2017-11-15 02:46:35|2017-11-17 20:18:40|2017-11-22 21:22:08|2017-12-12 00:00:00    |
|b23f8178f3b6555a78a4ad844ed11c57|2018-0

In [28]:
# Derive per-order lead times and lateness flags from the milestone
# timestamps, keeping one row per input row and only the derived columns
# plus order_id.
#
# F.datediff compares the date parts of its arguments, so each lead time is a
# count of calendar days, not of elapsed 24-hour periods (a step finished on
# the same calendar day yields 0).
timestamp_stats_wide = delivered_order_timestamp.select(
    'order_id',
    F.datediff(F.col('approved'), F.col('purchase'))
        .alias('lead_time_approve_days'),
    F.datediff(F.col('delivered_carrier'), F.col('approved'))
        .alias('lead_time_carrier_days'),
    F.datediff(F.col('delivered_customer'), F.col('delivered_carrier'))
        .alias('lead_time_customer_days'),
    F.datediff(F.col('delivered_customer'), F.col('purchase'))
        .alias('total_delivery_days'),
    # Both flags share the when(<late condition>, True).otherwise(False) shape
    # so that a NULL timestamp is handled identically in each: the comparison
    # evaluates to NULL, the when-branch does not fire, and the order is not
    # flagged. Writing the delivery flag as
    # when(delivered <= estimated, False).otherwise(True) would instead report
    # every order with a missing timestamp as late.
    F.when(F.col('delivered_customer') > F.col('estimated_delivery_date'), True)
        .otherwise(False)
        .alias('is_late_delivery'),
    F.when(F.col('shipping_limit_date') < F.col('delivered_carrier'), True)
        .otherwise(False)
        .alias('is_late_shipping'),
)

In [29]:
# Preview the derived lead times and lateness flags without truncating ids.
timestamp_stats_wide.show(5, truncate=False)

+--------------------------------+----------------------+----------------------+-----------------------+-------------------+----------------+----------------+
|order_id                        |lead_time_approve_days|lead_time_carrier_days|lead_time_customer_days|total_delivery_days|is_late_delivery|is_late_shipping|
+--------------------------------+----------------------+----------------------+-----------------------+-------------------+----------------+----------------+
|110f190bf49ab04f3ead4f860570e902|0                     |1                     |2                      |3                  |false           |false           |
|14129c568e62c007ad8d334de62aca31|1                     |2                     |5                      |8                  |false           |false           |
|b23f8178f3b6555a78a4ad844ed11c57|1                     |2                     |3                      |6                  |false           |false           |
|232208723745063b04b8e45bba7fb6bd|1           

In [30]:
# Spot-check the raw timestamps of a single, hand-picked order.
delivered_order_timestamp.filter(
    delivered_order_timestamp['order_id'] == '00010242fe8c5a6d1ba2dd792cb16214'
).show()

+--------------------+-------------------+--------------------+-------------------+-------------------+-------------------+-------------------+-----------------------+
|            order_id|shipping_limit_date|         customer_id|           purchase|           approved|  delivered_carrier| delivered_customer|estimated_delivery_date|
+--------------------+-------------------+--------------------+-------------------+-------------------+-------------------+-------------------+-----------------------+
|00010242fe8c5a6d1...|2017-09-19 09:45:35|871766c5855e863f6...|2017-09-13 08:59:02|2017-09-13 09:45:35|2017-09-19 18:34:16|2017-09-20 23:43:48|    2017-09-29 00:00:00|
+--------------------+-------------------+--------------------+-------------------+-------------------+-------------------+-------------------+-----------------------+



In [31]:
# Persist the wide stats as an Iceberg table, replacing any existing version.
(
    timestamp_stats_wide
    .writeTo(f"{test_namespace}.timestamp_stats_wide")
    .using('iceberg')
    .createOrReplace()
)

                                                                                

In [32]:
# Quick look at a single row of the wide stats after the write.
timestamp_stats_wide.show(n=1)

+--------------------+----------------------+----------------------+-----------------------+-------------------+----------------+----------------+
|            order_id|lead_time_approve_days|lead_time_carrier_days|lead_time_customer_days|total_delivery_days|is_late_delivery|is_late_shipping|
+--------------------+----------------------+----------------------+-----------------------+-------------------+----------------+----------------+
|110f190bf49ab04f3...|                     0|                     1|                      2|                  3|           false|           false|
+--------------------+----------------------+----------------------+-----------------------+-------------------+----------------+----------------+
only showing top 1 row



In [33]:
# Long format: reshape the wide per-order stats into one row per
# (order, lead-time stage), repeating both lateness flags on every row.
# stack(3, ...) emits three (lead_time_type, lead_time_days) rows per input
# row; total_delivery_days is not carried over into the long table.
#
# expr is reached through the F alias imported at the top of the notebook, so
# this cell does not need an import of its own.
timestamp_stats_long = timestamp_stats_wide.select(
    "order_id",
    "is_late_delivery",
    "is_late_shipping",
    F.expr("""
        stack(3,
            'until_approve', lead_time_approve_days,
            'until_carrier', lead_time_carrier_days,
            'until_customer', lead_time_customer_days
        ) as (lead_time_type, lead_time_days)
    """),
)

In [34]:

# Preview the long-format rows (20 is show()'s default row count).
timestamp_stats_long.show(n=20)

+--------------------+----------------+----------------+--------------+--------------+
|            order_id|is_late_delivery|is_late_shipping|lead_time_type|lead_time_days|
+--------------------+----------------+----------------+--------------+--------------+
|110f190bf49ab04f3...|           false|           false| until_approve|             0|
|110f190bf49ab04f3...|           false|           false| until_carrier|             1|
|110f190bf49ab04f3...|           false|           false|until_customer|             2|
|14129c568e62c007a...|           false|           false| until_approve|             1|
|14129c568e62c007a...|           false|           false| until_carrier|             2|
|14129c568e62c007a...|           false|           false|until_customer|             5|
|b23f8178f3b6555a7...|           false|           false| until_approve|             1|
|b23f8178f3b6555a7...|           false|           false| until_carrier|             2|
|b23f8178f3b6555a7...|           false|    

In [35]:
# Persist the long stats as an Iceberg table, replacing any existing version.
(
    timestamp_stats_long
    .writeTo(f"{test_namespace}.timestamp_stats_long")
    .using('iceberg')
    .createOrReplace()
)

                                                                                

In [36]:
# Shut down the Spark session; no further Spark work follows in this notebook.
spark.stop()