# Packages Loading

In [1]:
import os

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# Spark Config Setting

In [2]:
# Directory that holds extra JARs for Spark; overridable through the
# SPARK_JAR_DIR environment variable.
SPARK_JAR_DIR = os.getenv("SPARK_JAR_DIR", "/opt/spark/jars")

# Settings are grouped by concern and applied to the session builder below.
SPARK_STAND_ALONE_CORES = {"spark.cores.max": 3}

SPARK_DRIVER_CONFIGS = {"spark.driver.memory": "1g"}

SPARK_EXECUTOR_CONFIGS = {
    "spark.executor.cores": 1,
    "spark.executor.instances": 1,
    "spark.executor.memory": "1g",
}

# MySQL JDBC driver JAR, needed by the JDBC reads later in the notebook.
mysql_connector_jar = f"{SPARK_JAR_DIR}/mysql-connector-java-8.0.28.jar"

session_builder = (
    SparkSession.builder
    .appName("etl_snapshot")
    .master("spark://spark-master:7077")
    .config("spark.jars", mysql_connector_jar)
    .config("spark.ui.port", "4040")
)

# Apply each configuration group in turn; the groups share no keys, so the
# order in which they are applied does not affect the resulting settings.
for conf_group in (SPARK_DRIVER_CONFIGS, SPARK_EXECUTOR_CONFIGS, SPARK_STAND_ALONE_CORES):
    session_builder = session_builder.config(map=conf_group)

spark = session_builder.getOrCreate()

# Tag every job started from this notebook with a common group id and description.
spark.sparkContext.setJobGroup(
    groupId="Monthly User Movements Snapshot",
    description="test2",
    interruptOnCancel=False,
)

# RDB Connection Info

In [3]:
# JDBC connection options shared by every `spark.read.format("jdbc")` call.
# User, password and URL are read from the environment so real credentials
# never have to be committed with the notebook; the fallbacks keep the
# previous local-development values, so behaviour is unchanged when the
# variables are not set.
DB_CONN_INFO = {
    "user": os.getenv("ETL_DB_USER", "root"),
    "password": os.getenv("ETL_DB_PASSWORD", "root"),
    "url": os.getenv("ETL_DB_URL", "jdbc:mysql://mysql:3306/mysql"),
    # Fixed on purpose: the driver class must match the MySQL connector JAR
    # that the Spark session loads.
    "driver": "com.mysql.cj.jdbc.Driver",
}

# Data Sourcing Query

### 1. 검색 조건 일자 세팅 (Delta Load)

In [4]:
# Inclusive date window interpolated into the BETWEEN predicates of the
# sourcing queries below (ISO yyyy-mm-dd strings).
start_dt, end_dt = '1970-01-01', '2022-07-31'

### 2. 검색 조건에 부합하는 데이터의 최대 customer_id 산출 (파티션 분할 읽기의 upperBound로 사용)

In [5]:
# Despite the name, this query does not count rows: it returns the largest
# customer_id among successful transactions inside [start_dt, end_dt].
#
# The earlier version wrapped the table in a SELECT DISTINCT
# (customer_id, order_month) derived table. De-duplicating rows cannot change
# a maximum, so the aggregate now runs directly on the filtered table and
# MySQL is spared the extra DISTINCT / DATE_FORMAT work.
#
# NOTE(review): the table name is spelled `fact_transacation` everywhere in
# this notebook; presumably that matches the real warehouse table - confirm.
count_sql = f"""
SELECT MAX(customer_id)
FROM dw.fact_transacation
WHERE payment_status = 'Success'
AND date_id BETWEEN '{start_dt}' AND '{end_dt}'
"""

In [6]:
# Fetch the single value produced by `count_sql`: the maximum customer_id in
# the selected window (not a row count). It is used further down as the JDBC
# `upperBound`, so that the partitioned read can split the customer_id range.
result_row = (
    spark.read.format("jdbc")
    .options(**DB_CONN_INFO)
    .option("query", count_sql)
    .load()
    .first()[0]
)

# MAX() yields NULL when no row matches the filter. Fail here with a clear
# message rather than handing `None` to the partitioned read as its upper bound.
if result_row is None:
    raise ValueError(
        f"No successful transactions found between {start_dt} and {end_dt}; "
        "cannot derive the JDBC partition upper bound."
    )

In [7]:
# Echo the value fetched above: per `count_sql` it is MAX(customer_id), which is later passed as the JDBC upperBound.
result_row

99999

### 3. 파티션으로 균등 분할하여 데이터 Read

In [8]:
# Derived table handed to the JDBC reader. For every customer it lists each
# distinct month with a successful order, together with the previous such
# month (LAG over the customer's months) and the gap between the two in months.
# A customer's first month has no previous row, so both LAG-derived columns
# are NULL there.
dbtable = f"""(
SELECT customer_id
     , DATE(ym01) AS ym_first_day
     , DATE_FORMAT(ym01, '%Y-%m') AS order_month
     , LAG(ym01, 1) OVER (PARTITION BY customer_id ORDER BY ym01) AS prior_order_month
     , TIMESTAMPDIFF(MONTH, LAG(ym01, 1) OVER (PARTITION BY customer_id ORDER BY ym01), ym01) AS order_month_diff
FROM (
    SELECT DISTINCT customer_id
                  , DATE_FORMAT(date_id, '%Y-%m-01') AS ym01
    FROM dw.fact_transacation
    WHERE payment_status = 'Success'
    AND date_id BETWEEN '{start_dt}' AND '{end_dt}'
    ) AS user_monthly_order
) as subquery"""

# Options for the partitioned read: customer_id values between lowerBound and
# upperBound are divided into `numPartitions` slices read in parallel.
jdbc_partitioning = {
    "numPartitions": 3,
    "partitionColumn": "customer_id",
    "lowerBound": 1,
    "upperBound": result_row,
}

df = (
    spark.read.format("jdbc")
    .options(**DB_CONN_INFO)
    .options(dbtable=dbtable, **jdbc_partitioning)
    .load()
)

# df.cache()

In [9]:
# df.rdd.mapPartitionsWithIndex(lambda idx, it: [(idx, sum(1 for _ in it))]).collect()

In [10]:
# 파티션 개수
# df.rdd.getNumPartitions()

In [11]:
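# Row count per partition as (partition index, row count) pairs => the data is read in `numPartitions` (3) partitions, and all rows of a given customer_id land in the same partition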
# df.rdd.mapPartitionsWithIndex(lambda idx, it: [(idx, sum(1 for _ in it))]).collect()

In [12]:
# 데이터프레임 스키마 확인
# df.printSchema()

In [13]:
def _monthly_user_count(condition, alias):
    """Count the rows of `df` that satisfy `condition`, grouped by order_month.

    The count column of the returned DataFrame is named `alias`.
    """
    return (
        df.filter(condition)
        .groupBy("order_month")
        .agg(F.count("order_month").alias(alias))
    )


# Months elapsed since the customer's previous order month (NULL if none).
months_since_prior = F.col("order_month_diff")

# Case 1: no previous order month -> customers who are new in that month.
new = _monthly_user_count(months_since_prior.isNull(), "new_cnt")

# Case 2: gap of 3 months or more -> returning customers.
old = _monthly_user_count(months_since_prior >= 3, "old_cnt")

# Case 3: gap below 3 months -> existing (continuing) customers.
exist = _monthly_user_count(months_since_prior <= 2, "exist_cnt")

In [14]:
# Full outer join so that a month present in only one of the two aggregates is kept.
new_old = new.join(old, "order_month", "full_outer")

In [15]:
# Attach the existing-customer counts, replace the NULL counts produced by the
# outer joins with 0, then add the three segments up into the monthly active
# user total (`mau`).
new_old_exist = (
    new_old
    .join(exist, "order_month", "full_outer")
    .fillna(0)
    .withColumn("mau", F.col("new_cnt") + F.col("old_cnt") + F.col("exist_cnt"))
)

In [16]:
# Print up to 200 monthly rows. Row order after joins is not guaranteed, so
# sort explicitly by month to keep the report chronological on every run.
new_old_exist.orderBy("order_month").show(200)

+-----------+-------+-------+---------+-----+
|order_month|new_cnt|old_cnt|exist_cnt|  mau|
+-----------+-------+-------+---------+-----+
|    2016-06|      1|      0|        0|    1|
|    2016-07|    268|      0|        0|  268|
|    2016-08|    423|      0|       43|  466|
|    2016-09|    435|      0|      150|  585|
|    2016-10|    512|     10|      252|  774|
|    2016-11|    488|     42|      372|  902|
|    2016-12|    280|     54|      494|  828|
|    2017-01|    498|    112|      532| 1142|
|    2017-02|    471|    138|      620| 1229|
|    2017-03|    516|    144|      746| 1406|
|    2017-04|    498|    171|      886| 1555|
|    2017-05|    463|    221|     1008| 1692|
|    2017-06|    277|    258|     1105| 1640|
|    2017-07|    981|    303|     1183| 2467|
|    2017-08|    590|    346|     1393| 2329|
|    2017-09|    592|    324|     1611| 2527|
|    2017-10|    655|    456|     1731| 2842|
|    2017-11|    543|    455|     1871| 2869|
|    2017-12|    386|    532|     

In [17]:
# Print the logical plans (parsed, analyzed, optimized) and the physical plan.
new_old_exist.explain(mode="extended")

== Parsed Logical Plan ==
'Project [order_month#53, new_cnt#62L, old_cnt#63L, exist_cnt#64L, (('new_cnt + 'old_cnt) + 'exist_cnt) AS mau#69]
+- Project [order_month#53, coalesce(new_cnt#18L, cast(0.0 as bigint)) AS new_cnt#62L, coalesce(old_cnt#27L, cast(0.0 as bigint)) AS old_cnt#63L, coalesce(exist_cnt#36L, cast(0.0 as bigint)) AS exist_cnt#64L]
   +- Project [coalesce(order_month#44, order_month#50) AS order_month#53, new_cnt#18L, old_cnt#27L, exist_cnt#36L]
      +- Join FullOuter, (order_month#44 = order_month#50)
         :- Project [coalesce(order_month#4, order_month#41) AS order_month#44, new_cnt#18L, old_cnt#27L]
         :  +- Join FullOuter, (order_month#4 = order_month#41)
         :     :- Aggregate [order_month#4], [order_month#4, count(order_month#4) AS new_cnt#18L]
         :     :  +- Filter isnull(order_month_diff#6L)
         :     :     +- Relation [customer_id#2,ym_first_day#3,order_month#4,prior_order_month#5,order_month_diff#6L] JDBCRelation((
SELECT customer_id

In [None]:
# Stop the Spark session created at the top of the notebook once the snapshot is done.
spark.stop()