# Packages Loading

In [17]:
from dateutil.relativedelta import relativedelta
from datetime import datetime as dt
import os

import pandas as pd
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, DateType
from pyspark.sql.window import Window

# Spark Config Setting

In [18]:
SPARK_JAR_DIR = os.getenv("SPARK_JAR_DIR", "/opt/spark/jars")

SPARK_STAND_ALONE_CORES = {
    "spark.cores.max": 3
}

SPARK_DRIVER_CONFIGS = {
    "spark.driver.memory": "1g"
}

SPARK_EXECUTOR_CONFIGS = {
    "spark.executor.cores": 1,
    "spark.executor.instances": 1,
    "spark.executor.memory": "1g",
}

spark = SparkSession.builder \
    .appName("etl_snapshot") \
    .master("spark://spark-master:7077") \
    .config("spark.jars", f"{SPARK_JAR_DIR}/mysql-connector-java-8.0.28.jar") \
    .config("spark.ui.port", "4040") \
    .config(map=SPARK_DRIVER_CONFIGS) \
    .config(map=SPARK_EXECUTOR_CONFIGS) \
    .config(map=SPARK_STAND_ALONE_CORES) \
    .getOrCreate()

spark.sparkContext.setJobGroup("snap_monthly_user_repurchase_cycle_hist", "The overall median of the average repurchase cycle per user over one year", interruptOnCancel=False)

# RDB Connection Info

In [19]:
DB_CONN_INFO = {
    "user": "root",
    "password": "root",
    "url": "jdbc:mysql://mysql:3306/mysql",
    "driver": "com.mysql.cj.jdbc.Driver"
}

# Data Sourcing

### 1. 검색 조건 일자 세팅 (Delta Load)

In [20]:
start_de = '2016-06-01'
end_de = '2022-07-31'

### 검색 조건을 넣어 주문데이터의 유저별 주문일자의 LAG와 그 차이 일수를 산출하는 쿼리 실행 (동일 날짜에 여러 번 구매는 한 건으로만 카운팅하기 위해 제거)

In [21]:
source_sql = f"""
    SELECT *
    FROM (
        SELECT CUST_ID
             , LAG(ORDR_DE) OVER (PARTITION BY CUST_ID ORDER BY ORDR_DE) AS PRIOR_PURCHASE
             , ORDR_DE 
             , DATEDIFF(ORDR_DE, LAG(ORDR_DE) OVER (PARTITION BY CUST_ID ORDER BY ORDR_DE)) AS REPURCHASE_CYCLE
        FROM dm.dm_f_ordr
        WHERE ordr_de BETWEEN '{start_de}' AND '{end_de}'
    ) AS cust_repurchase
    WHERE PRIOR_PURCHASE != 0
"""

In [22]:
# 검색할 범주의 전체 행의 개수 산출 (파티션 수를 균등하게 분할하여 읽기 위한 전체 개수 탐색)
sdf = spark.read.format("jdbc") \
    .options(**DB_CONN_INFO) \
    .option("query", source_sql) \
    .load()

In [23]:
sdf.cache()

DataFrame[CUST_ID: int, PRIOR_PURCHASE: date, ORDR_DE: date, REPURCHASE_CYCLE: int]

# 매일 월별 스냅샷 쿼리 (멱등성 처리)

In [24]:
date_range = []

for dt in pd.date_range(start=start_de, end=end_de, freq='MS'):
    snapshot_stdr_de = dt.date() 
    one_year_ago_de = snapshot_stdr_de - relativedelta(years=1) # 1년 전의 날짜 구하기
    one_day_before_de = snapshot_stdr_de - relativedelta(days=1) # 하루 전의 날짜 구하기

    date_range.append({
        "snapshot_stdr_de": snapshot_stdr_de,
        "one_year_ago_de": one_year_ago_de,
        "one_day_before_de": one_day_before_de
    })

In [25]:
schema = StructType([
    StructField("SNAPSHOT_STDR_DE", DateType(), False),
    StructField("INTERVAL_START_DE", DateType(), False),
    StructField("INTERVAL_END_DE", DateType(), False),
    StructField("ALL_REPURCHASE_CYCLE_MEDIAN", IntegerType(), True)
])

# 빈 DataFrame 생성
result_sdf = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema)

for row in date_range:

    sdf_2 = sdf.filter(F.col("ORDR_DE").between(row["one_year_ago_de"], row["one_day_before_de"]))

    sdf_3 = sdf_2.withColumn("row_num", F.row_number().over(Window.partitionBy("CUST_ID").orderBy("ORDR_DE")))

    sdf_4 = sdf_3.filter(F.col("row_num") > 1)

    sdf_5 = sdf_4.groupBy("CUST_ID").agg(F.avg("REPURCHASE_CYCLE").alias("USER_REPUCHASE_CYCLE_AVG"))

    sdf_6 = sdf_5.agg(F.ceil(F.median("USER_REPUCHASE_CYCLE_AVG")).alias("ALL_REPURCHASE_CYCLE_MEDIAN"))

    sdf_7 = sdf_6.withColumn("SNAPSHOT_STDR_DE", F.lit(row["snapshot_stdr_de"]).cast(DateType())) \
        .withColumn("INTERVAL_START_DE", F.lit(row["one_year_ago_de"]).cast(DateType())) \
        .withColumn("INTERVAL_END_DE", F.lit(row["one_day_before_de"]).cast(DateType()))
    
    result_sdf = result_sdf.union(sdf_7.select(["SNAPSHOT_STDR_DE", "INTERVAL_START_DE", "INTERVAL_END_DE", "ALL_REPURCHASE_CYCLE_MEDIAN"]))

In [26]:
result_sdf.show(n=100)

+----------------+-----------------+---------------+---------------------------+
|SNAPSHOT_STDR_DE|INTERVAL_START_DE|INTERVAL_END_DE|ALL_REPURCHASE_CYCLE_MEDIAN|
+----------------+-----------------+---------------+---------------------------+
|      2016-06-01|       2015-06-01|     2016-05-31|                       NULL|
|      2016-07-01|       2015-07-01|     2016-06-30|                       NULL|
|      2016-08-01|       2015-08-01|     2016-07-31|                          9|
|      2016-09-01|       2015-09-01|     2016-08-31|                         12|
|      2016-10-01|       2015-10-01|     2016-09-30|                         18|
|      2016-11-01|       2015-11-01|     2016-10-31|                         20|
|      2016-12-01|       2015-12-01|     2016-11-30|                         25|
|      2017-01-01|       2016-01-01|     2016-12-31|                         27|
|      2017-02-01|       2016-02-01|     2017-01-31|                         30|
|      2017-03-01|       201

# pyspark.sql.DataFrame을 pandas.DataFrame 으로 변환

In [27]:
result_pdf = result_sdf.toPandas()

In [28]:
result_pdf.dtypes

SNAPSHOT_STDR_DE                object
INTERVAL_START_DE               object
INTERVAL_END_DE                 object
ALL_REPURCHASE_CYCLE_MEDIAN    float64
dtype: object

In [29]:
date_type_cols = ("SNAPSHOT_STDR_DE", "INTERVAL_START_DE", "INTERVAL_END_DE")

for col in date_type_cols:
    result_pdf[col] = pd.to_datetime(result_pdf[col])

## 22.07 스냅샷 (21.07~22.06)의 (고객별 평균 재구매주기의 전체 중앙값)의 이동평균값 산출 

In [30]:
result_pdf[result_pdf["SNAPSHOT_STDR_DE"] == '2022-07-01'].ALL_REPURCHASE_CYCLE_MEDIAN

73    49.0
Name: ALL_REPURCHASE_CYCLE_MEDIAN, dtype: float64

In [31]:
result_pdf.to_excel("snap_monthly_all_user_repurchase_cycle_median.xlsx", index=False)

# Stop SparkSession

In [32]:
spark.stop()