In [1]:
from pyspark.sql import functions as F
from service.utils.spark import get_spark_session
# Build the Spark session through the project helper.
# NOTE(review): dev=True presumably selects a development configuration — confirm in service.utils.spark.
spark = get_spark_session(dev=True)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/21 13:18:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Namespace that the tables read and written by this notebook live in.
test_namespace = 'gold'

# IF NOT EXISTS keeps the statement safe to re-run when the namespace is already there.
create_namespace_ddl = f"CREATE NAMESPACE IF NOT EXISTS {test_namespace}"
spark.sql(create_namespace_ddl)

DataFrame[]

In [3]:
# Load the per-order milestone timestamps (purchase, approved, delivered_carrier,
# delivered_customer) and preview the first rows without truncating the order ids.
complete_order_timestamp_table = f"{test_namespace}.complete_order_timestamp"
complete_order_timestamp_df = spark.read.table(complete_order_timestamp_table)
complete_order_timestamp_df.show(5, truncate=False)

                                                                                

+--------------------------------+-------------------+-------------------+-------------------+-------------------+
|order_id                        |purchase           |approved           |delivered_carrier  |delivered_customer |
+--------------------------------+-------------------+-------------------+-------------------+-------------------+
|a9a93c428c6103f2151bb63a1d32a520|2017-01-14 17:57:50|2017-01-17 10:55:13|2017-01-20 17:58:58|2017-01-30 18:37:44|
|56ef80c564f6fd57cc662adee0379746|2017-01-16 14:24:22|2017-01-16 14:35:17|2017-01-16 15:21:56|2017-01-23 08:56:05|
|f9427374480e37251d5c279ebc41a3ab|2017-01-17 14:57:45|2017-01-18 02:10:16|2017-01-19 09:12:51|2017-01-24 15:14:01|
|d6d7c431275f0029dcc3538850930046|2017-01-19 14:28:48|2017-01-19 14:41:56|2017-01-24 10:05:23|2017-01-31 12:26:08|
|0957ed870116e596b800540427c61497|2017-01-29 22:14:49|2017-01-29 22:33:34|2017-01-30 08:27:47|2017-02-08 17:14:55|
+--------------------------------+-------------------+-------------------+------

In [4]:
# Timestamp pattern used to parse the estimated delivery date column.
format_string = 'yyyy-MM-dd HH:mm:ss'

# Read the tab-separated estimated-delivery file (header row present) and convert
# the estimate from its raw text form into a timestamp in one chained expression.
ed_df = (
    spark.read
    .csv("s3a://warehousedev/bronze/tsv/estimated_delivery_date.tsv", header=True, sep='\t')
    .withColumn(
        'estimated_delivery_date',
        F.to_timestamp('estimated_delivery_date', format_string),
    )
)

# Inner join on order_id: only orders present in both inputs are kept.
order_timestamp_with_ed = complete_order_timestamp_df.join(ed_df, 'order_id', 'inner')
order_timestamp_with_ed.show()


25/09/21 13:18:28 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

+--------------------+-------------------+-------------------+-------------------+-------------------+-----------------------+
|            order_id|           purchase|           approved|  delivered_carrier| delivered_customer|estimated_delivery_date|
+--------------------+-------------------+-------------------+-------------------+-------------------+-----------------------+
|a9a93c428c6103f21...|2017-01-14 17:57:50|2017-01-17 10:55:13|2017-01-20 17:58:58|2017-01-30 18:37:44|    2017-02-28 00:00:00|
|56ef80c564f6fd57c...|2017-01-16 14:24:22|2017-01-16 14:35:17|2017-01-16 15:21:56|2017-01-23 08:56:05|    2017-02-24 00:00:00|
|f9427374480e37251...|2017-01-17 14:57:45|2017-01-18 02:10:16|2017-01-19 09:12:51|2017-01-24 15:14:01|    2017-02-23 00:00:00|
|d6d7c431275f0029d...|2017-01-19 14:28:48|2017-01-19 14:41:56|2017-01-24 10:05:23|2017-01-31 12:26:08|    2017-03-13 00:00:00|
|0957ed870116e596b...|2017-01-29 22:14:49|2017-01-29 22:33:34|2017-01-30 08:27:47|2017-02-08 17:14:55|    2017-

In [5]:
# Derive per-order delivery metrics from the joined milestone timestamps.
#
# F.datediff(end, start) returns the number of days from `start` to `end`, so each
# lead time is a day count between two consecutive milestones.
#
# is_late is the direct comparison `delivered_customer > estimated_delivery_date`.
# For non-null inputs this equals the previous
# `when(delivered <= estimated, False).otherwise(True)`, but when either side is
# NULL the comparison now yields NULL instead of falling through to `otherwise`
# and being reported as late.
# NOTE(review): the estimated dates shown above are all at 00:00:00, so a delivery
# made later on the estimated day itself counts as late — confirm this is intended.
delivery_stats = order_timestamp_with_ed.select(
    'order_id',
    F.datediff(F.col('approved'), F.col('purchase'))
    .alias('lead_time_approve_days'),
    F.datediff(F.col('delivered_carrier'), F.col('approved'))
    .alias('lead_time_carrier_days'),
    F.datediff(F.col('delivered_customer'), F.col('delivered_carrier'))
    .alias('lead_time_customer_days'),
    F.datediff(F.col('delivered_customer'), F.col('purchase'))
    .alias('total_delivery_days'),
    (F.col('delivered_customer') > F.col('estimated_delivery_date'))
    .alias('is_late'),
)
delivery_stats.show(n=5)


+--------------------+----------------------+----------------------+-----------------------+-------------------+-------+
|            order_id|lead_time_approve_days|lead_time_carrier_days|lead_time_customer_days|total_delivery_days|is_late|
+--------------------+----------------------+----------------------+-----------------------+-------------------+-------+
|a9a93c428c6103f21...|                     3|                     3|                     10|                 16|  false|
|56ef80c564f6fd57c...|                     0|                     0|                      7|                  7|  false|
|f9427374480e37251...|                     1|                     1|                      5|                  7|  false|
|d6d7c431275f0029d...|                     0|                     5|                      7|                 12|  false|
|0957ed870116e596b...|                     0|                     1|                      9|                 10|  false|
+--------------------+----------

In [6]:
# Persist the metrics as an Iceberg table, creating it or replacing an existing one.
(
    delivery_stats
    .writeTo(f"{test_namespace}.delivery_stats")
    .using('iceberg')
    .createOrReplace()
)

                                                                                

In [7]:
# Shut down the Spark session now that the table has been written.
spark.stop()