## Window 함수: ROWS BETWEEN ... AND ... 이해 실습

In [0]:
# Five sample records (value, name, ts) used by the window-function examples.
rows_test = [
    {'value': value, 'name': name, 'ts': ts}
    for value, name, ts in (
        (1, 'Luka', '2025-04-01'),
        (2, 'Luka', '2025-04-03'),
        (3, 'Dirk', '2025-04-01'),
        (4, 'Dirk', '2025-04-30'),
        (5, 'Luka', '2025-04-30'),
    )
]

# No schema is passed, so Spark infers column names and types from the dicts.
df = spark.createDataFrame(rows_test)

In [0]:
# Print the column names and types of the sample DataFrame.
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- ts: string (nullable = true)
 |-- value: long (nullable = true)



In [0]:
# Display the rows of the sample DataFrame.
df.show()

+----+----------+-----+
|name|        ts|value|
+----+----------+-----+
|Luka|2025-04-01|    1|
|Luka|2025-04-03|    2|
|Dirk|2025-04-01|    3|
|Dirk|2025-04-30|    4|
|Luka|2025-04-30|    5|
+----+----------+-----+



In [0]:
# Register the sample DataFrame as the temporary view "rows_test" so the
# SQL examples can query it through spark.sql.
df.createOrReplaceTempView("rows_test")

### 앞에서 봤던 rolling sum 구하기 (현재 행과 앞뒤 2개씩, 총 5개 행의 합)

In [0]:
# Centered moving sum: with rows ordered by value, each row adds up the two
# rows before it, itself, and the two rows after it (fewer at the edges).
rolling_sum_query = """
  SELECT 
    value,
    SUM(value) OVER (
        ORDER BY value 
        ROWS BETWEEN 2 PRECEDING and 2 FOLLOWING
    ) AS rolling_sum
  FROM rows_test"""

spark.sql(rolling_sum_query).show()

+-----+-----------+
|value|rolling_sum|
+-----+-----------+
|    1|          6|
|    2|         10|
|    3|         15|
|    4|         14|
|    5|         12|
+-----+-----------+



### 이번에는 앞쪽 행은 모두 더하고 뒤쪽 행은 2개만 더하기

In [0]:
# Same sum as before, but the frame now starts at the first row: every
# earlier row is included, plus the current row and the two rows after it.
unbounded_preceding_sum_query = """
  SELECT 
    value,
    SUM(value) OVER (
        ORDER BY value 
        ROWS BETWEEN UNBOUNDED PRECEDING AND 2 FOLLOWING
    ) AS rolling_sum
  FROM rows_test"""

spark.sql(unbounded_preceding_sum_query).show()

+-----+-----------+
|value|rolling_sum|
+-----+-----------+
|    1|          6|
|    2|         10|
|    3|         15|
|    4|         15|
|    5|         15|
+-----+-----------+



### 같은 name을 갖는 레코드들 중에서 시간순(ts)으로 봤을 때 처음 value(initial_value)와 마지막 value(last_value)를 알고 싶다면?

In [0]:
# For every row, attach the earliest and latest value (by ts) seen for the
# same name. The explicit UNBOUNDED PRECEDING .. UNBOUNDED FOLLOWING frame
# makes both functions look at the whole partition, not just part of it.
first_last_value_query = """
  SELECT 
    *,
    FIRST_VALUE(value) OVER (
        PARTITION BY name
        ORDER BY ts 
        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
    ) AS initial_value,
    LAST_VALUE(value) OVER (
        PARTITION BY name
        ORDER BY ts 
        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
    ) AS last_value    
  FROM rows_test
  ORDER BY name"""

spark.sql(first_last_value_query).show()

+----+----------+-----+-------------+----------+
|name|        ts|value|initial_value|last_value|
+----+----------+-----+-------------+----------+
|Dirk|2025-04-01|    3|            3|         4|
|Dirk|2025-04-30|    4|            3|         4|
|Luka|2025-04-01|    1|            1|         5|
|Luka|2025-04-03|    2|            1|         5|
|Luka|2025-04-30|    5|            1|         5|
+----+----------+-----+-------------+----------+



In [0]:
# Same window expressions, but only name and the two window columns are
# projected. Window functions do not collapse rows, so each name still
# appears once per input record.
first_last_by_name_query = """
  SELECT 
    name,
    FIRST_VALUE(value) OVER (
        PARTITION BY name
        ORDER BY ts 
        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
    ) AS initial_value,
    LAST_VALUE(value) OVER (
        PARTITION BY name
        ORDER BY ts 
        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
    ) AS last_value    
  FROM rows_test
  ORDER BY name"""

spark.sql(first_last_by_name_query).show()

+----+-------------+----------+
|name|initial_value|last_value|
+----+-------------+----------+
|Dirk|            3|         4|
|Dirk|            3|         4|
|Luka|            1|         5|
|Luka|            1|         5|
|Luka|            1|         5|
+----+-------------+----------+



In [0]:
# Adding DISTINCT removes the repeated (name, initial_value, last_value)
# rows, leaving a single summary row per name.
distinct_first_last_query = """
  SELECT 
    DISTINCT
    name,
    FIRST_VALUE(value) OVER (
        PARTITION BY name
        ORDER BY ts 
        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
    ) AS initial_value,
    LAST_VALUE(value) OVER (
        PARTITION BY name
        ORDER BY ts 
        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
    ) AS last_value    
  FROM rows_test
  ORDER BY name"""

spark.sql(distinct_first_last_query).show()

+----+-------------+----------+
|name|initial_value|last_value|
+----+-------------+----------+
|Dirk|            3|         4|
|Luka|            1|         5|
+----+-------------+----------+



### 같은 name을 갖는 레코드들에 시간순으로 일련번호를 붙이고 싶다면?

In [0]:
# In addition to the first/last value columns, ROW_NUMBER() numbers the
# records of each name from 1 upward in ts order (column "seq").
row_number_seq_query = """
  SELECT 
    *,
    FIRST_VALUE(value) OVER (
        PARTITION BY name
        ORDER BY ts 
        rows between unbounded preceding and unbounded following
    ) AS initial_value,
    LAST_VALUE(value) OVER (
        PARTITION BY name
        ORDER BY ts 
        rows between unbounded preceding and unbounded following
    ) AS last_value,
    ROW_NUMBER() OVER (
        PARTITION BY name
        ORDER BY ts
    ) AS seq   
  FROM rows_test
  ORDER BY name"""

spark.sql(row_number_seq_query).show()

+----+----------+-----+-------------+----------+---+
|name|        ts|value|initial_value|last_value|seq|
+----+----------+-----+-------------+----------+---+
|Dirk|2025-04-01|    3|            3|         4|  1|
|Dirk|2025-04-30|    4|            3|         4|  2|
|Luka|2025-04-01|    1|            1|         5|  1|
|Luka|2025-04-03|    2|            1|         5|  2|
|Luka|2025-04-30|    5|            1|         5|  3|
+----+----------+-----+-------------+----------+---+



## 사용자별로 처음 채널과 마지막 채널을 알아내기

### 데이터셋 준비하기

In [0]:
# Load the three session CSV files from S3. For each file the first line is
# read as the header and column types are inferred from the data.
df_user_session_channel = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("s3a://s3-geospatial/readonly/user_session_channel.csv")
)

df_session_timestamp = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("s3a://s3-geospatial/readonly/session_timestamp.csv")
)

df_session_transaction = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("s3a://s3-geospatial/readonly/session_transaction.csv")
)

In [0]:
# Register each loaded DataFrame as a temporary view so the SQL cells can
# refer to it by table name.
for view_name, frame in (
    ("user_session_channel", df_user_session_channel),
    ("session_timestamp", df_session_timestamp),
    ("session_transaction", df_session_transaction),
):
    frame.createOrReplaceTempView(view_name)

In [0]:
# Preview the first 5 rows of the user/session/channel data.
df_user_session_channel.show(5)

+------+--------------------+--------+
|userid|           sessionid| channel|
+------+--------------------+--------+
|   184|c41dd99a69df04044...|   Naver|
|    80|fdc0eb412a84fa549...| Organic|
|   251|0a54b19a13b6712dc...|Facebook|
|   264|a914ecef9c12ffdb9...|  Google|
|   744|05ae14d7ae387b933...|Facebook|
+------+--------------------+--------+
only showing top 5 rows



In [0]:
# Preview the first 5 rows of the session timestamp data.
df_session_timestamp.show(5)

+--------------------+--------------------+
|           sessionid|                  ts|
+--------------------+--------------------+
|7cdace91c487558e2...|2019-05-01 00:13:...|
|94f192dee566b018e...|2019-05-01 00:49:...|
|7ed2d3454c5eea711...|2019-05-01 10:18:...|
|f1daf122cde863010...|2019-05-01 13:10:...|
|fd0efcca272f704a7...|2019-05-01 13:45:...|
+--------------------+--------------------+
only showing top 5 rows



In [0]:
# Preview the first 5 rows of the session transaction data.
df_session_transaction.show(5)

+--------------------+--------+------+
|           sessionid|refunded|amount|
+--------------------+--------+------+
|00029153d12ae1c9a...|   false|    85|
|008909bd27b680698...|   false|    13|
|0107acb41ef20db22...|   false|    16|
|018544a2c48077d2c...|   false|    39|
|020c38173caff0203...|   false|    61|
+--------------------+--------+------+
only showing top 5 rows



### ROW_NUMBER를 사용해서 사용자별 처음 채널과 마지막 채널 알아내기

In [0]:
# First and last channel per user via ROW_NUMBER:
#   1. The RECORD CTE numbers each user's sessions twice, once by ascending
#      ts (seq_first) and once by descending ts (seq_last).
#   2. RECORD is joined to itself on userid, keeping the row with
#      seq_first = 1 on one side and the row with seq_last = 1 on the other.
# NOTE(review): the LEFT JOIN keeps sessions that have no matching timestamp;
# their NULL ts still takes part in the ordering - confirm whether an INNER
# JOIN is intended. Sessions of one user sharing the same ts would also be
# numbered in an arbitrary order.
first_last_channel_query = """
WITH RECORD AS (
  SELECT /*사용자의 유입에 따른, 채널 순서 매기는 쿼리*/
      userid,
      channel, 
      ROW_NUMBER() OVER (PARTITION BY userid ORDER BY ts ASC) AS seq_first,
      ROW_NUMBER() OVER (PARTITION BY userid ORDER BY ts DESC) AS seq_last
  FROM user_session_channel u
  LEFT JOIN session_timestamp t
    ON u.sessionid = t.sessionid
)
SELECT /*유저의 첫번째 유입채널, 마지막 유입 채널 구하기*/
      f.userid,
      f.channel first_channel,
      l.channel last_channel
FROM RECORD f
INNER JOIN RECORD l ON f.userid = l.userid
WHERE f.seq_first = 1 and l.seq_last = 1
ORDER BY userid
"""

spark.sql(first_last_channel_query).show()

+------+-------------+------------+
|userid|first_channel|last_channel|
+------+-------------+------------+
|    27|      Youtube|   Instagram|
|    29|        Naver|       Naver|
|    33|       Google|     Youtube|
|    34|      Youtube|       Naver|
|    36|        Naver|     Youtube|
|    40|      Youtube|      Google|
|    41|     Facebook|     Youtube|
|    44|        Naver|   Instagram|
|    45|      Youtube|   Instagram|
|    59|    Instagram|   Instagram|
|    64|      Youtube|     Youtube|
|    65|      Youtube|     Organic|
|    68|      Youtube|     Organic|
|    69|     Facebook|   Instagram|
|    80|      Organic|       Naver|
|    84|       Google|     Youtube|
|    87|      Youtube|      Google|
|    97|      Organic|     Organic|
|   112|     Facebook|     Youtube|
|   113|      Organic|     Organic|
+------+-------------+------------+
only showing top 20 rows



### FIRST_VALUE와 LAST_VALUE를 사용해서 사용자별 처음 채널과 마지막 채널 알아내기

In [0]:
# First and last channel per user via FIRST_VALUE / LAST_VALUE:
# every session row gets the channel of the user's earliest and latest
# session (by ts, over the whole partition), and DISTINCT then collapses
# the repeated rows into one row per user.
#
# The result is sorted by userid so the displayed rows are deterministic and
# line up with the ROW_NUMBER-based query; without an ORDER BY the row order
# of a DISTINCT result is not guaranteed.
# NOTE(review): the LEFT JOIN keeps sessions that have no matching timestamp;
# their NULL ts still takes part in the ordering - confirm whether an INNER
# JOIN is intended.
first_last_channel_query = """
SELECT DISTINCT A.userid,
    FIRST_VALUE(A.channel) over(partition by A.userid order by B.ts
rows between unbounded preceding and unbounded following) AS First_Channel,
    LAST_VALUE(A.channel) over(partition by A.userid order by B.ts
rows between unbounded preceding and unbounded following) AS Last_Channel
FROM user_session_channel A
LEFT JOIN session_timestamp B
ON A.sessionid = B.sessionid
ORDER BY userid"""

spark.sql(first_last_channel_query).show()

+------+-------------+------------+
|userid|First_Channel|Last_Channel|
+------+-------------+------------+
|    27|      Youtube|   Instagram|
|    29|        Naver|       Naver|
|    33|       Google|     Youtube|
|    34|      Youtube|       Naver|
|    36|        Naver|     Youtube|
|    40|      Youtube|      Google|
|    41|     Facebook|     Youtube|
|    44|        Naver|   Instagram|
|    45|      Youtube|   Instagram|
|    59|    Instagram|   Instagram|
|    64|      Youtube|     Youtube|
|    65|      Youtube|     Organic|
|    68|      Youtube|     Organic|
|    69|     Facebook|   Instagram|
|    80|      Organic|       Naver|
|    84|       Google|     Youtube|
|    87|      Youtube|      Google|
|    97|      Organic|     Organic|
|   112|     Facebook|     Youtube|
|   113|      Organic|     Organic|
+------+-------------+------------+
only showing top 20 rows

