In [12]:
from pyspark.sql import functions as F
from service.utils.spark import get_spark_session
# Create the Spark session that every cell below uses, via the project helper.
# NOTE(review): `dev=True` presumably selects a development configuration — confirm in service.utils.spark.
spark = get_spark_session(dev=True)

In [13]:
# Namespace that receives the table written at the end of this notebook.
test_namespace = 'gold'
# Destructive reset, intentionally disabled: it would drop the namespace and everything in it.
# spark.sql(f"DROP NAMESPACE IF EXISTS {test_namespace} CASCADE")
create_namespace_sql = f"CREATE NAMESPACE IF NOT EXISTS {test_namespace}"
spark.sql(create_namespace_sql).show()

++
||
++
++



In [14]:
# Only the order_id column of the complete_order_timestamp table is needed here;
# it is joined against the order items in a later cell.
# NOTE: the variable name is misspelled ("compolet"); it is kept because a later cell refers to it.
compolet_order_id_df = (
    spark.read
    .table('warehousedev.gold.complete_order_timestamp')
    .select('order_id')
)
compolet_order_id_df.show(n=5, truncate=False)

+--------------------------------+
|order_id                        |
+--------------------------------+
|a9a93c428c6103f2151bb63a1d32a520|
|56ef80c564f6fd57cc662adee0379746|
|f9427374480e37251d5c279ebc41a3ab|
|d6d7c431275f0029dcc3538850930046|
|0957ed870116e596b800540427c61497|
+--------------------------------+
only showing top 5 rows



In [15]:
# Raw bronze-layer TSV files. No schema is supplied or inferred, so every column is read as a string.
order_item_df = spark.read.csv("s3a://warehousedev/bronze/tsv/order_item.tsv", header=True, sep='\t')
product_df = spark.read.csv("s3a://warehousedev/bronze/tsv/product.tsv", header=True, sep='\t')

target_category = 'health_beauty'

# Products of the target category, reduced to the two columns used downstream.
target_category_product = (
    product_df
    .filter(F.col('category') == target_category)
    .select('product_id', 'category')
)

# One order can contain several items, so order_item_id is kept.
# `total_price` is the item price plus its freight cost.
# The operands are cast explicitly because the CSV columns are strings; this avoids
# depending on Spark's implicit string-to-number coercion (which may differ under ANSI mode).
# No sort is applied here: Spark does not guarantee that row order survives the joins
# and aggregations that follow, so sorting at this point would only add a shuffle.
target_category_order = (
    target_category_product
    .join(order_item_df, on='product_id', how='inner')
    .withColumn(
        'total_price',
        F.round(F.col('price').cast('double') + F.col('freight_value').cast('double'), 4),
    )
    .drop('shipping_limit_date', 'price', 'freight_value')
)

# Keep only the items whose order_id appears in the complete_order_timestamp table.
# NOTE(review): this is an inner join, so duplicate order_id values in that table
# would multiply rows — confirm order_id is unique there.
complete_target_category_order_df = target_category_order.join(compolet_order_id_df, on='order_id', how='inner')

In [16]:
# gold
# Per-product sales summary, sorted by order_count descending.
# NOTE(review): F.count('order_id') counts rows, and the input has one row per order item,
# so an order holding several items of the same product is counted more than once —
# confirm this is the intended meaning of `order_count`.
per_product = complete_target_category_order_df.groupBy('product_id')
sale_stats = (
    per_product
    .agg(
        F.count('order_id').alias('order_count'),
        F.round(F.sum('total_price'), 4).alias('total_sales'),
    )
    .orderBy(F.desc('order_count'))
    # mean_sale is derived from the already-rounded total_sales column.
    .withColumn('mean_sale', F.round(F.col('total_sales') / F.col('order_count'), 4))
)
# sale_stats.show(truncate=False)

In [23]:
# 1. Compute the segmentation thresholds.
# Both approximate percentiles are requested in ONE aggregation so that the sale_stats
# lineage (CSV reads, joins, group-by) is evaluated once instead of once per threshold.
thresholds = sale_stats.agg(
    F.expr("percentile_approx(order_count, 0.75)").alias("order_count_threshold"),
    F.expr("percentile_approx(mean_sale, 0.5)").alias("median_avg_price"),
).first()

order_count_threshold = thresholds["order_count_threshold"]  # 75th percentile of order_count
median_avg_price = thresholds["median_avg_price"]            # median of mean_sale

# print(order_count_threshold)
# print(median_avg_price)


# 2. Add the 'segment' column.
# Each product is classified by which side of the two thresholds it falls on.
# The predicates are named once so the four quadrants read directly from the code.
high_volume = F.col("order_count") >= order_count_threshold
low_volume = F.col("order_count") < order_count_threshold
high_price = F.col("mean_sale") >= median_avg_price
low_price = F.col("mean_sale") < median_avg_price

target_product_sales_stats_segment = sale_stats.withColumn(
    "segment",
    F.when(high_volume & high_price, "Star Products")
    .when(high_volume & low_price, "Volume Drivers")
    .when(low_volume & high_price, "Niche Gems")
    # Everything else: low volume with low price, plus any row where a comparison is null.
    .otherwise("Question Marks"),
)

target_product_sales_stats_segment.show(n=5, truncate=False)



+--------------------------------+-----------+-----------+---------+--------------+
|product_id                      |order_count|total_sales|mean_sale|segment       |
+--------------------------------+-----------+-----------+---------+--------------+
|154e7e31ebfa092203795c972e5804a6|274        |9824.2     |35.8547  |Volume Drivers|
|2b4609f8948be18874494203496bc318|255        |26127.92   |102.4624 |Star Products |
|7c1bd920dbdf22470b68bde975dd3ccf|220        |16735.7    |76.0714  |Volume Drivers|
|bb50f2e236e5eea0100680137654686c|194        |67258.03   |346.6909 |Star Products |
|19c91ef95d509ea33eda93495c4d3481|154        |24038.01   |156.091  |Star Products |
+--------------------------------+-----------+-----------+---------+--------------+
only showing top 5 rows



                                                                                

# 구매 기록에 세그먼트 join

In [18]:
# Load the complete_order_timestamp table with all of its columns for the join in the next cell.
# NOTE: the variable name is misspelled ("copmlete"); it is left as-is because a later cell refers to it.
copmlete_order_timestamp = spark.read.table('warehousedev.gold.complete_order_timestamp')

In [25]:
# Join the target-category order items to the complete_order_timestamp table on order_id
# and keep only the columns needed for the purchase records.
# NOTE(review): `purchase` presumably comes from complete_order_timestamp — confirm against its schema.
target_product_purchase_info = (
    copmlete_order_timestamp
    .join(complete_target_category_order_df, on='order_id', how='inner')
    .select(
        'order_id',
        'order_item_id',
        'purchase',
        'product_id',
        'category',
        'seller_id',
        'total_price',
    )
)
target_product_purchase_info.show()



+--------------------+-------------+-------------------+--------------------+-------------+--------------------+-----------+
|            order_id|order_item_id|           purchase|          product_id|     category|           seller_id|total_price|
+--------------------+-------------+-------------------+--------------------+-------------+--------------------+-----------+
|0005a1a1728c9d785...|            1|2018-03-19 18:40:33|310ae3c140ff94b03...|health_beauty|a416b6a846a117243...|      157.6|
|00061f2a7bc09da83...|            1|2018-03-24 22:16:10|d63c1011f49d98b97...|health_beauty|cc419e0650a3c5ba7...|      68.87|
|001d8f0e34a38c37f...|            1|2017-05-14 17:19:44|e67307ff0f15ade43...|health_beauty|f4aba7c0bca51484c...|      26.77|
|001d8f0e34a38c37f...|            2|2017-05-14 17:19:44|e67307ff0f15ade43...|health_beauty|f4aba7c0bca51484c...|      26.77|
|001e7cf2ad6bef3ad...|            1|2018-05-19 10:29:23|bdcf6a834e8faa30d...|health_beauty|2a84855fd20af891b...|       51.1|


                                                                                

In [26]:
# Attach each product's segment label to its purchase rows (inner join on product_id).
# Only product_id and segment are taken from the stats frame, so no stats columns leak through.
target_product_purchase_order_info_with_segment = target_product_purchase_info.join(
    target_product_sales_stats_segment.select('product_id', 'segment'),
    on='product_id',
    how='inner',
)


In [27]:
# Preview the first five rows of the purchase records with their segment label.
target_product_purchase_order_info_with_segment.show(n=5)

                                                                                

+--------------------+--------------------+-------------+-------------------+-------------+--------------------+-----------+-------------+
|          product_id|            order_id|order_item_id|           purchase|     category|           seller_id|total_price|      segment|
+--------------------+--------------------+-------------+-------------------+-------------+--------------------+-----------+-------------+
|05f0fe07929d35be0...|362cd36939c1c4638...|            1|2018-06-25 23:00:42|health_beauty|c70c1b0d8ca86052f...|     118.35|Star Products|
|05f0fe07929d35be0...|b41cc5a0872a7fd88...|            1|2018-07-16 10:53:45|health_beauty|c70c1b0d8ca86052f...|     118.35|Star Products|
|05f0fe07929d35be0...|b71ba668b12da5320...|            1|2018-06-03 10:47:54|health_beauty|c70c1b0d8ca86052f...|     117.96|Star Products|
|05f0fe07929d35be0...|bcd881f346c66116b...|            2|2018-07-21 11:39:34|health_beauty|c70c1b0d8ca86052f...|     129.19|Star Products|
|05f0fe07929d35be0...|d1c3d

In [None]:
# Persist the segmented purchase records as a table in `test_namespace`;
# createOrReplace() creates the table or replaces an existing one of the same name.
output_table = f"{test_namespace}.target_product_purchase_order_info_with_segment"
target_product_purchase_order_info_with_segment.writeTo(output_table).createOrReplace()

                                                                                

In [29]:
# Stop the Spark session now that the notebook's work is done; cells run after this need a new session.
spark.stop()