In [1]:
from pyspark.sql import functions as F
from pyspark.sql.types import FloatType
from service.utils.spark import get_spark_session

In [2]:
# Create the Spark session through the project helper.
# NOTE(review): `dev=True` presumably selects a development configuration — confirm in service.utils.spark.
spark = get_spark_session(dev=True)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/25 13:16:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [64]:
# Namespace that holds the tables read by this notebook; it is created only if it does not already exist.
test_namespace = 'gold' 
spark.sql(f"CREATE NAMESPACE IF NOT EXISTS {test_namespace}")

DataFrame[]

In [65]:
# Load the `delivered_order_product` table from the configured namespace.
delivered_order_product = spark.read.table(f"{test_namespace}.delivered_order_product")

In [None]:
# Product category whose purchase history is analysed in the rest of the notebook.
TARGET_CATEGORY = 'health_beauty'

# Restrict the delivered order items to the target category only.
target_order_product = delivered_order_product.where(
    F.col('category') == TARGET_CATEGORY
)

In [67]:
# Load the order timestamp table and keep only the columns that link an order to its customer and purchase time.
# NOTE(review): `tmp` is used only to build `order_customer` in this cell; a descriptive name would read better.
tmp = spark.read.table(f"{test_namespace}.delivered_order_timestamp")
order_customer = tmp.select('order_id', 'customer_id', 'purchase')

In [68]:
# Attach customer_id and purchase to every target-category order item,
# then remove rows that are exact duplicates across all columns.
# Rows containing nulls are deliberately NOT dropped at this step. Original author's note:
# nulls occur only in `category`, and those rows are needed for the retention aggregation,
# so they are kept for now.
target_purchase_history = (
    target_order_product
    .join(order_customer, on='order_id', how='inner')
    .dropDuplicates()
)

In [69]:
# Isolate the records that contain a null in at least one column.
from functools import reduce

all_cols = target_purchase_history.columns

# One "IS NULL" predicate per column of the purchase history.
null_conditions = list(map(lambda name: F.col(name).isNull(), all_cols))

# Fold the per-column predicates into a single condition with OR (|),
# producing a Column of the form ((order_id IS NULL) OR (product_id IS NULL)) OR ...
final_condition = reduce(lambda combined, nxt: combined | nxt, null_conditions)

order_product_at_least_null = target_purchase_history.where(final_condition)

In [70]:
# Show the isolated rows that contain at least one null value.
order_product_at_least_null.show()

                                                                                

+--------+----------+--------+-------------+---------+-----------+-----------+--------+
|order_id|product_id|category|order_item_id|seller_id|total_price|customer_id|purchase|
+--------+----------+--------+-------------+---------+-----------+-----------+--------+
+--------+----------+--------+-------------+---------+-----------+-----------+--------+



In [71]:
# Keep only rows with no null in any column (dropna() defaults to how='any').
clean_target_purchase_history = target_purchase_history.dropna()

In [72]:
# Preview the cleaned purchase history.
clean_target_purchase_history.show()

+--------------------+--------------------+-------------+-------------+--------------------+-----------+--------------------+-------------------+
|            order_id|          product_id|     category|order_item_id|           seller_id|total_price|         customer_id|           purchase|
+--------------------+--------------------+-------------+-------------+--------------------+-----------+--------------------+-------------------+
|028ad19261e3ad340...|9453bde60c4ee5215...|health_beauty|            1|5c243662ce92d8457...|     485.02|e0a32719d497e50ca...|2017-05-18 15:19:09|
|02f8584bbcb3aeae8...|e4985e03cd939f1a9...|health_beauty|            1|edb1ef5e36e0c8cd8...|     106.07|d8020dbc8d6b3281e...|2018-06-11 17:29:39|
|0497899302217e8f8...|cd8c7501d1e3a66f2...|health_beauty|            1|392e0502231ae2f8b...|     121.16|56d9f7e21e5646dbe...|2017-02-07 11:21:12|
|0aa1d9509ae8e0c76...|7c0a2a31bc479e366...|health_beauty|            1|5c243662ce92d8457...|     180.27|d8f979fcfc0ce686f...

                                                                                

In [73]:
# Per-customer purchase summary over the cleaned, single-category history:
#   total_purchase_count  - number of rows with a non-null product_id
#   total_category_count  - number of distinct categories (expected to be 1, since the
#                           input was filtered to TARGET_CATEGORY)
#   first/last_purchase_date - earliest and latest purchase timestamp
purchase_stats = (
    clean_target_purchase_history
    .groupBy('customer_id')
    .agg(
        F.count('product_id').alias('total_purchase_count'),
        F.countDistinct('category').alias('total_category_count'),
        F.min('purchase').alias('first_purchase_date'),
        F.max('purchase').alias('last_purchase_date'),
    )
)
purchase_stats.show()



+--------------------+--------------------+--------------------+-------------------+-------------------+
|         customer_id|total_purchase_count|total_category_count|first_purchase_date| last_purchase_date|
+--------------------+--------------------+--------------------+-------------------+-------------------+
|f86def7712b8dd86f...|                   1|                   1|2017-07-23 15:34:47|2017-07-23 15:34:47|
|538f20c5da86daf31...|                   1|                   1|2017-07-06 16:18:52|2017-07-06 16:18:52|
|6fe3cecd4de549447...|                   1|                   1|2018-05-08 16:39:50|2018-05-08 16:39:50|
|a21ef70a8eb4b1b65...|                   1|                   1|2018-01-03 22:20:04|2018-01-03 22:20:04|
|e2eece9925247f8d5...|                   1|                   1|2018-08-08 07:11:24|2018-08-08 07:11:24|
|b68acf578f10bd6ec...|                   1|                   1|2017-12-01 22:07:21|2017-12-01 22:07:21|
|101d2a28e8392e031...|                   1|            

                                                                                

In [81]:
# Spot-check the aggregated stats for one specific customer.
purchase_stats.filter(F.col('customer_id') == '10b442e762ba8a5ad22de5ba2207e5f1').show()

                                                                                

+--------------------+--------------------+--------------------+-------------------+-------------------+
|         customer_id|total_purchase_count|total_category_count|first_purchase_date| last_purchase_date|
+--------------------+--------------------+--------------------+-------------------+-------------------+
|10b442e762ba8a5ad...|                   4|                   1|2018-04-02 21:47:07|2018-06-15 08:23:31|
+--------------------+--------------------+--------------------+-------------------+-------------------+



- If `total_purchase_count` > 1 and `first_purchase_date` == `last_purchase_date`: 동일 상품 여러개 구매

In [74]:
# Keep customers whose first and last purchase timestamps differ, i.e. who purchased at more than one point in time.
# NOTE(review): `retention_cadidate` looks like a misspelling of "candidate"; the name is left as-is because later cells reference it.
date_condition = F.col('first_purchase_date') < F.col('last_purchase_date')
retention_cadidate = purchase_stats.filter(date_condition).select('customer_id')

In [75]:
# Preview the customer_ids selected as retention candidates.
retention_cadidate.show()

+--------------------+
|         customer_id|
+--------------------+
|5a2e847dd085d36e3...|
|e7ea4c3049b0ffcb3...|
|31aa2034dc4b34b06...|
|ef309784561fed416...|
|5b3186d7cf904f2f9...|
|5c117c579cb98a9c6...|
|3daa1c97a3155484f...|
|91e59b92e87f627ab...|
|7b51437497e9c9b82...|
|3a41ac64aa715256d...|
|5a6b37816cefa8497...|
|771f344c1ac13d8cc...|
|5c8b41ba21631ca04...|
|bb4d84a2b45b22ed7...|
|798c34ffa90473998...|
|4d7b6c39aa3c7b53a...|
|da29b8a3970adddb9...|
|674e552503d21cd54...|
|b64ebaf3d11b7209f...|
|821e75291b1ad362e...|
+--------------------+
only showing top 20 rows



In [84]:
# For every retention candidate, pull the purchase timestamps from the cleaned history
# and collapse identical (customer_id, purchase) rows into one.
retention_history = (
    retention_cadidate
    .join(
        clean_target_purchase_history.select('customer_id', 'purchase'),
        on='customer_id',
        how='left',
    )
    .dropDuplicates()
)

In [95]:
from pyspark.sql.window import Window

df = retention_history

# Order each customer's purchases chronologically so lag() returns the preceding purchase.
window_spec = Window.partitionBy("customer_id").orderBy("purchase")

# Previous purchase timestamp per row; null for a customer's first purchase.
df_with_lag = df.withColumn("previous_purchase_date", F.lag("purchase").over(window_spec))

# Gap between the current and the previous purchase, in seconds:
# both timestamps are cast to long (epoch seconds) before subtracting.
df_with_interval = df_with_lag.withColumn(
    "purchase_interval_second",
    F.col("purchase").cast("long") - F.col("previous_purchase_date").cast("long"),
)

In [96]:
# Preview three rows, ordered by customer_id, without truncating column values.
df_with_interval.orderBy('customer_id').show(n=3, truncate=False)



+--------------------------------+-------------------+----------------------+------------------------+
|customer_id                     |purchase           |previous_purchase_date|purchase_interval_second|
+--------------------------------+-------------------+----------------------+------------------------+
|01886ef98f995e4f2dd75a1d04c97397|2017-11-27 20:47:54|NULL                  |NULL                    |
|01886ef98f995e4f2dd75a1d04c97397|2018-03-09 19:19:24|2017-11-27 20:47:54   |8807490                 |
|04d3c2795b83634312bf3f86f41f7901|2017-06-09 17:38:12|NULL                  |NULL                    |
+--------------------------------+-------------------+----------------------+------------------------+
only showing top 3 rows



                                                                                

In [None]:
# Drop rows containing nulls. In the preceding preview these are each customer's first purchase,
# which has no previous purchase and therefore no interval.
clean_df_with_interval = df_with_interval.dropna()
clean_df_with_interval.orderBy('customer_id').show()



+--------------------+-------------------+----------------------+------------------------+
|         customer_id|           purchase|previous_purchase_date|purchase_interval_second|
+--------------------+-------------------+----------------------+------------------------+
|01886ef98f995e4f2...|2018-03-09 19:19:24|   2017-11-27 20:47:54|                 8807490|
|04d3c2795b8363431...|2017-08-08 20:46:53|   2017-06-09 17:38:12|                 5195321|
|08c07c761592a71b8...|2018-01-26 22:03:52|   2017-08-11 12:22:46|                14550066|
|0981a271500799427...|2017-04-17 10:05:11|   2017-04-03 13:57:14|                 1195677|
|0cc2ee1112a11a109...|2017-10-03 11:54:16|   2017-05-15 09:00:01|                12192855|
|0dd933d1e02533430...|2017-11-24 12:08:59|   2017-06-13 11:11:47|                14173032|
|10b442e762ba8a5ad...|2018-06-15 08:23:31|   2018-04-02 21:47:07|                 6345384|
|1118bbd6c82a824eb...|2017-10-28 00:24:21|   2017-10-28 00:24:20|                       1|

                                                                                

In [101]:
# Average gap, in seconds, between consecutive purchases for each retained customer.
# The alias must not carry trailing whitespace: a column named 'mean_purchase_interval_sec '
# cannot be referenced later as 'mean_purchase_interval_sec'.
mean_purchase_interval_sec = clean_df_with_interval.groupBy('customer_id').agg(
    F.mean('purchase_interval_second').alias('mean_purchase_interval_sec')
)
mean_purchase_interval_sec.show()

                                                                                

+--------------------+---------------------------+
|         customer_id|mean_purchase_interval_sec |
+--------------------+---------------------------+
|5a2e847dd085d36e3...|                      986.0|
|31aa2034dc4b34b06...|                  1360746.0|
|e7ea4c3049b0ffcb3...|                  6742362.0|
|ef309784561fed416...|                  3568372.0|
|5b3186d7cf904f2f9...|                     1741.0|
|5c117c579cb98a9c6...|                    36231.0|
|3daa1c97a3155484f...|                 2.429645E7|
|91e59b92e87f627ab...|                    58909.0|
|7b51437497e9c9b82...|                  1462991.0|
|3a41ac64aa715256d...|                  5534199.0|
|5a6b37816cefa8497...|                        1.0|
|771f344c1ac13d8cc...|                1.4413189E7|
|5c8b41ba21631ca04...|                  1815465.0|
|bb4d84a2b45b22ed7...|                        1.0|
|798c34ffa90473998...|                     1575.0|
|4d7b6c39aa3c7b53a...|                1.6166917E7|
|da29b8a3970adddb9...|         

In [93]:
# Re-display the per-customer purchase summary.
purchase_stats.show()



+--------------------+--------------------+--------------------+-------------------+-------------------+
|         customer_id|total_purchase_count|total_category_count|first_purchase_date| last_purchase_date|
+--------------------+--------------------+--------------------+-------------------+-------------------+
|f86def7712b8dd86f...|                   1|                   1|2017-07-23 15:34:47|2017-07-23 15:34:47|
|538f20c5da86daf31...|                   1|                   1|2017-07-06 16:18:52|2017-07-06 16:18:52|
|6fe3cecd4de549447...|                   1|                   1|2018-05-08 16:39:50|2018-05-08 16:39:50|
|a21ef70a8eb4b1b65...|                   1|                   1|2018-01-03 22:20:04|2018-01-03 22:20:04|
|e2eece9925247f8d5...|                   1|                   1|2018-08-08 07:11:24|2018-08-08 07:11:24|
|b68acf578f10bd6ec...|                   1|                   1|2017-12-01 22:07:21|2017-12-01 22:07:21|
|101d2a28e8392e031...|                   1|            

                                                                                

In [None]:
# Stop the Spark session once the analysis is finished.
spark.stop()