In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from service.utils.spark import get_spark_session
# spark = SparkSession.builder.getOrCreate()
spark = get_spark_session(dev=True)
test_namespace = 'warehousedev.silver.test'
spark.conf.get('spark.sql.catalog.warehousedev.s3.region')
spark.sql(f"CREATE NAMESPACE IF NOT EXISTS {test_namespace}")

DataFrame[]

In [3]:
def write_iceberg(df, full_table_identifier):
    writer = \
        df.writeTo(full_table_identifier)

    if not spark.catalog.tableExists(full_table_identifier):
        writer.create()
    else:
        writer.overwritePartitions()

In [None]:
# silver
order_status_df = spark.read.csv("s3a://warehousedev/bronze/tsv/order_status.tsv", header=True, sep='\t')

delivered_customer_order_id = order_status_df.filter(order_status_df.status == 'delivered_customer').select(['order_id'])
delivered_customer_order_df = order_status_df.join(delivered_customer_order_id, on='order_id', how='inner')
status_list = ['purchase', 'delivered_customer']
delivered_customer_order_df = delivered_customer_order_df.filter(F.col('status').isin(status_list))

pivoted_df = delivered_customer_order_df.groupBy('order_id') \
    .pivot('status', ['purchase', 'delivered_customer']) \
    .agg(F.first('timestamp'))

delivered_order_df = pivoted_df.withColumnsRenamed({
    'purchase': 'purchase_date',
    'delivered_customer': 'delivery_date'
})
    
ed_df = spark.read.csv("s3a://warehousedev/bronze/tsv/estimated_delivery_date.tsv", header=True, sep='\t')
delivered_order_df = delivered_order_df.join(ed_df, on='order_id', how='inner')
payment_df = spark.read.csv("s3a://warehousedev/bronze/tsv/payment.tsv", header=True, sep='\t')
payment_info_df = payment_df.select(['order_id', 'customer_id']).dropDuplicates()

# payment와 order_status에 모두 있는 order_id만 남김
clean_order_df = delivered_order_df.join(payment_info_df, on='order_id', how='inner')

format_string1 = 'yyyy-MM-dd HH:mm:ss.SSSSSS'
format_string2 = 'yyyy-MM-dd HH:mm:ss'
clean_order_df = clean_order_df.withColumns({
    'purchase_date': F.to_timestamp(F.col('purchase_date'), format_string1),
    'delivery_date': F.to_timestamp(F.col('delivery_date'), format_string1),
    'estimated_delivery_date': F.to_timestamp(F.col('estimated_delivery_date'), format_string2)
    })
clean_order_df = clean_order_df.orderBy(F.col('purchase_date').asc())
# clean_order_df.show(truncate=False)
write_iceberg(clean_order_df, f"{test_namespace}.clean_order")

25/09/19 11:16:53 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


                                                                                

In [None]:
# gold
delivery_stats = clean_order_df \
    .withColumn(
        'delivery_lead_time',
        F.datediff(F.col('delivery_date'), F.col('purchase_date'))) \
    .withColumn(
        'is_late',
        F.when(F.col('delivery_date') <= F.col('estimated_delivery_date'), False)
        .otherwise(True)
    )
# delivery_stats.show()
write_iceberg(delivery_stats, f"{test_namespace}.delivery_stats")



+--------------------+-------------------+-------------------+-----------------------+--------------------+------------------+-------+
|            order_id|      purchase_date|      delivery_date|estimated_delivery_date|         customer_id|delivery_lead_time|is_late|
+--------------------+-------------------+-------------------+-----------------------+--------------------+------------------+-------+
|3b697a20d9e427646...|2016-10-03 09:44:50|2016-10-26 14:02:13|    2016-10-27 00:00:00|32ea3bdedab835c3a...|                23|  false|
|be5bc2f0da14d8071...|2016-10-03 16:56:50|2016-10-27 18:19:38|    2016-11-07 00:00:00|2f64e403852e6893a...|                24|  false|
|65d1e226dfaeb8cdc...|2016-10-03 21:01:41|2016-11-08 10:58:34|    2016-11-25 00:00:00|b8b8726af116a5cfb...|                36|  false|
|a41c8759fbe7aab36...|2016-10-03 21:13:36|2016-11-03 10:58:07|    2016-11-29 00:00:00|61db744d2f835035a...|                31|  false|
|d207cc272675637bf...|2016-10-03 22:06:03|2016-10-31 11

                                                                                

In [None]:
# silver
order_item_df = spark.read.csv("s3a://warehousedev/bronze/tsv/order_item.tsv", header=True, sep='\t')
product_df = spark.read.csv("s3a://warehousedev/bronze/tsv/product.tsv", header=True, sep='\t')

taget_category = 'health_beauty'
taget_category_product = product_df.filter(product_df.category == taget_category).select(['product_id', 'category'])
taget_category_order = taget_category_product.join(order_item_df, on='product_id', how='inner')
taget_category_order = taget_category_order.sort('product_id')

# 하나의 order에 여러 item이 있을 수 있으므로, order_item_id는 남겨둔다.
# `total_price` 는 상품가와 배송비를 합산한 가격
taget_category_order = taget_category_order.withColumn('total_price', F.round(F.col('price') + F.col('freight_value'), 4))
taget_category_order = taget_category_order.drop('shipping_limit_date', 'price', 'freight_value')
clean_category_order_df = taget_category_order.join(clean_order_df.select(['order_id']), on='order_id', how='inner')
# clean_category_order_df.show(truncate=False)
write_iceberg(delivery_stats, f"{test_namespace}.clean_category_order")


[Stage 164:>                                                        (0 + 1) / 1]

+--------------------------------+--------------------------------+-------------+-------------+--------------------------------+-----------+
|order_id                        |product_id                      |category     |order_item_id|seller_id                       |total_price|
+--------------------------------+--------------------------------+-------------+-------------+--------------------------------+-----------+
|0e4672661531addf3fa0f55961e55242|dc861f18267cce21f3b70f9f7990b8ce|health_beauty|1            |17f51e7198701186712e53a39c564617|1304.02    |
|107478e48c13dc0b39a379510464cd9d|7518c51f502ca391864ea9b073e634d6|health_beauty|1            |ececbfcff9804a2d6b40f589df8eef2b|84.24      |
|1976e889aa37b60cfcaad40513117846|e0cf79767c5b016251fe139915c59a26|health_beauty|1            |da8622b14eb17ae2831f4ac5b9dab84a|37.61      |
|199c3d5d5456786637603c3975e504f3|632c37951cab36a4e928428cbc4fa4f4|health_beauty|1            |2e1c9f22be269ef4643f826c9e650a52|169.8      |
|1aaeb5badaa8

                                                                                

In [None]:
# gold
sale_stats = clean_category_order_df.groupBy('product_id').agg(
    F.count('order_id').alias('order_count'),
    F.round(F.sum(F.col('total_price')), 4).alias('total_sales')
    ).orderBy(F.col('order_count').desc())

sale_stats = sale_stats.withColumn('mean_sale', F.round(F.col('total_sales') / F.col('order_count'), 4))
# sale_stats.show(truncate=False)
write_iceberg(delivery_stats, f"{test_namespace}.sale_stats")



+--------------------------------+-----------+-----------+---------+
|product_id                      |order_count|total_sales|mean_sale|
+--------------------------------+-----------+-----------+---------+
|154e7e31ebfa092203795c972e5804a6|274        |9824.2     |35.8547  |
|2b4609f8948be18874494203496bc318|255        |26127.92   |102.4624 |
|7c1bd920dbdf22470b68bde975dd3ccf|220        |16735.7    |76.0714  |
|bb50f2e236e5eea0100680137654686c|194        |67258.03   |346.6909 |
|19c91ef95d509ea33eda93495c4d3481|154        |24038.01   |156.091  |
|6cdd53843498f92890544667809f1595|153        |57933.73   |378.6518 |
|437c05a395e9e47f9762e677a7068ce7|151        |9335.88    |61.827   |
|3fbc0ef745950c7932d5f2a446189725|149        |13680.15   |91.8131  |
|e0cf79767c5b016251fe139915c59a26|137        |5484.97    |40.0363  |
|4c2394abfbac7ff59ec7a420918562fa|120        |12205.38   |101.7115 |
|8c292ca193d326152e335d77176746f0|89         |15201.61   |170.8046 |
|7fb04722aba7a2b632bac8f9819796f3|

                                                                                

In [None]:
# silver: review metadata
review_df = spark.read.csv("s3a://warehousedev/bronze/tsv/review.tsv", header=True, sep='\t')

review_metadat_df = review_df.drop('review_comment_title', 'review_comment_message')
clean_review_metadata_df = review_metadat_df.join(clean_category_order_df.select('order_id', 'product_id'), on=['order_id'], how='inner')
clean_review_metadata_df = clean_review_metadata_df.orderBy('product_id')

format_string = "yyyy-MM-dd HH:mm:ss"
clean_review_metadata_df = clean_review_metadata_df \
    .withColumn('review_creation_date', F.to_timestamp(F.col('review_creation_date'), format_string)) \
    .withColumn('review_answer_timestamp', F.to_timestamp(F.col('review_answer_timestamp'), format_string))

clean_review_metadata_df = clean_review_metadata_df.withColumn('answer_lead_time', F.datediff(F.col('review_answer_timestamp'), F.col('review_creation_date')) )

clean_review_metadata_df = clean_review_metadata_df.drop('review_creation_date', 'review_answer_timestamp')
# clean_review_metadata_df.show()
write_iceberg(delivery_stats, f"{test_namespace}.clean_review_metadata")



+--------------------+--------------------+------------+--------------------+----------------+
|            order_id|           review_id|review_score|          product_id|answer_lead_time|
+--------------------+--------------------+------------+--------------------+----------------+
|c01f1e6ceafe26dc7...|f8bdbfeda4ecce455...|           5|00210e41887c2a8ef...|               5|
|1fd6c29ecb9dd8b65...|d5ce4524953171740...|           5|00210e41887c2a8ef...|               2|
|2d8e71bf7d31a41d4...|dd8d78ad2be2e3477...|           5|00210e41887c2a8ef...|               1|
|eb24356203f63304c...|13493da42ce41d718...|           5|00210e41887c2a8ef...|               1|
|2d8e71bf7d31a41d4...|dd8d78ad2be2e3477...|           5|00210e41887c2a8ef...|               1|
|226975521c585d7bb...|bde3a6bc851b615b1...|           1|00210e41887c2a8ef...|               1|
|eb24356203f63304c...|13493da42ce41d718...|           5|00210e41887c2a8ef...|               1|
|4ff907acfe03d4b4e...|cfc901be886cfe062...|       

                                                                                

In [None]:

for file_name in ['geolocation', 'customer', 'seller']:
    tmp_df = spark.read.csv(f"s3a://warehousedev/bronze/tsv/{file_name}.tsv", header=True, sep='\t')
    write_iceberg(delivery_stats, f"{tmp_df}.clean_review_metadata")