# Cassandra + PySpark Batching 예제

### 1. findspark를 통해 pyspark 등 라이브러리 추가, SparkSession 생성

In [1]:
import findspark
findspark.init("/usr/local/lib/spark-3.3.2-bin-hadoop3")

from pyspark import SparkConf
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import *
from pyspark.sql.functions import udf, col, from_json, pandas_udf, split

session = SparkSession.builder \
    .appName("Jupyter_Notebook_2") \
    .master("yarn") \
    .config("spark.yarn.queue", "batch") \
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.2,com.datastax.spark:spark-cassandra-connector_2.12:3.3.0") \
    .config("spark.hadoop.hive.exec.dynamic.partition.mode", "nonstrict") \
    .enableHiveSupport() \
    .getOrCreate()

:: loading settings :: url = jar:file:/usr/local/lib/spark-3.3.2-bin-hadoop3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
com.datastax.spark#spark-cassandra-connector_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-7954e4b4-281a-4f80-99d1-bec0d0704765;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.3.2 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.3.2 in central
	found org.apache.kafka#kafka-clients;2.8.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.8.4 in central
	found org.slf4j#slf4j-api;1.7.32 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.2 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.hadoop#hadoop-client-api;3.3.2 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found 

23/03/27 01:02:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/27 01:02:12 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.
23/03/27 01:02:15 WARN Client: Same path resource file:///root/.ivy2/jars/org.apache.spark_spark-sql-kafka-0-10_2.12-3.3.2.jar added multiple times to distributed cache.
23/03/27 01:02:15 WARN Client: Same path resource file:///root/.ivy2/jars/com.datastax.spark_spark-cassandra-connector_2.12-3.3.0.jar added multiple times to distributed cache.
23/03/27 01:02:15 WARN Client: Same path resource file:///root/.ivy2/jars/org.apache.spark_spark-token-provider-kafka-0-10_2.12-3.3.2.jar added multiple times to distributed cache.
23/03/27 01:02:15 WARN Client: Same path resource file:///root/.ivy2/jars/org.apache.kafka_kafka-clients-2.8.1.jar added multiple times to distributed cache.
23/03/27 01:02:15 WARN Client: Same path resource file:///root/.ivy2/jars/org.apache.commons_commons-pool2-2.11.1.jar added multiple times to distributed cache.
23/03/27 01:0

### 2. Cassandra와 연결

In [2]:
cassandra_keyspace = "tagmanager"
cassandra_table = "stream"

batch_df = session.read \
      .format("org.apache.spark.sql.cassandra") \
  .option("checkpointLocation", "/") \
  .option("spark.cassandra.connection.host", "master01") \
  .option("spark.cassandra.connection.port", 9042) \
  .option("keyspace", cassandra_keyspace) \
  .option("table", cassandra_table) \
  .option("spark.cassandra.connection.remoteConnectionsPerExecutor", 10) \
  .option("spark.cassandra.output.concurrent.writes", 1000) \
  .option("spark.cassandra.concurrent.reads", 512) \
  .option("spark.cassandra.output.batch.grouping.buffer.size", 1000) \
  .option("spark.cassandra.connection.keep_alive_ms", 600000000) \
      .load()
batch_df.printSchema()

23/03/27 01:02:30 WARN DeprecatedConfigParameter: spark.cassandra.connection.keep_alive_ms is deprecated (DSE 6.0.0) and has been automatically replaced with parameter spark.cassandra.connection.keepAliveMS. 
23/03/27 01:02:31 WARN DeprecatedConfigParameter: spark.cassandra.connection.keep_alive_ms is deprecated (DSE 6.0.0) and has been automatically replaced with parameter spark.cassandra.connection.keepAliveMS. 
root
 |-- service_id: long (nullable = false)
 |-- creation_timestamp: timestamp (nullable = true)
 |-- session_id: string (nullable = true)
 |-- client_id: long (nullable = true)
 |-- event: string (nullable = true)
 |-- key: string (nullable = true)
 |-- location: string (nullable = true)
 |-- page_duration: long (nullable = true)
 |-- position_x: integer (nullable = true)
 |-- position_y: integer (nullable = true)
 |-- prev_location: string (nullable = true)
 |-- referrer: string (nullable = true)
 |-- service_token: string (nullable = true)
 |-- target_id: string (nullabl

### 3. PySqark SQL을 이용해 쿼리 작성

In [7]:
from datetime import datetime
from datetime import timedelta

# 간편한 between 연산을 위해 만든 유틸리티 함수
# base_time: 기준 시간
# interval: 기분 시간으로부터 얼마나 조회를 할 지의 범위
# 초, 분, 시 등의 단위
# ex. timestamp_range("2023-03-21 13:49:00", 10, 'm') => 2023-03-21 13:49:00 부터 10분 이후의 시간까지
def timestamp_range(base_time, interval, unit):
    dt_obj = datetime.strptime(base_time, '%Y-%m-%d %H:%M:%S')
    if unit=='s':
        if interval>=0:
            return (dt_obj, dt_obj+timedelta(seconds=interval))
        else:
            return (dt_obj-timedelta(seconds=-interval), dt_obj)
    if unit=='m':
        if interval>=0:
            return (dt_obj, dt_obj+timedelta(minutes=interval))
        else:
            return (dt_obj-timedelta(minutes=-interval), dt_obj)
    if unit=='H':
        if interval>=0:
            return (dt_obj, dt_obj+timedelta(hours=interval))
        else:
            return (dt_obj-timedelta(hours=-interval), dt_obj)
    if unit=='D':
        if interval>=0:
            return (dt_obj, dt_obj+timedelta(days=interval))
        else:
            return (dt_obj-timedelta(days=-interval), dt_obj)
    if unit=='M':
        if interval>=0:
            return (dt_obj, dt_obj+timedelta(months=interval))
        else:
            return (dt_obj-timedelta(months=-interval), dt_obj)
    if unit=='Y':
        if interval>=0:
            return (dt_obj, dt_obj+timedelta(years=interval))
        else:
            return (dt_obj-timedelta(years=-interval), dt_obj)


base_time = "2023-03-27 01:00:00"

# 해당 시간 사이의 모든 데이터 조회
batch_df.select("*") \
    .where(col("creation_timestamp") \
            .between(*timestamp_range(base_time, 5, 'm'))) \
    .show()

# 해당 시간 사이에 http://localhost:3000/second에서 일어난 click 이벤트 조회
batch_df.select("*") \
    .where(col("creation_timestamp") \
            .between(*timestamp_range(base_time, 5, 'm'))) \
    .where(col("location") \
           .like("http://localhost:3000/second")) \
    .where(col("event") \
           .like("click")) \
    .show()

# 해당 시간 사이에 http://localhost:3000/second에 접속한 사용자 조회
batch_df.select("*") \
    .where(col("creation_timestamp") \
            .between(*timestamp_range(base_time, -30, 'm'))) \
    .where(col("location") \
            .like("http://localhost:3000/second")) \
    .select("session_id").distinct() \
    .show()

# location, event 기준으로 그룹핑 후 개수 세기
batch_df.select("*") \
    .where(col("creation_timestamp") \
            .between(*timestamp_range(base_time, 1, 'D'))) \
    .groupBy("location", "event").count() \
    .show()

# session_id 기준으로 해당 시간동안의 서비스 체류시간 연산
batch_df.select("*") \
    .where(col("creation_timestamp") \
            .between(*timestamp_range(base_time, 1, 'D'))) \
    .groupBy("session_id").agg( \
        max("creation_timestamp").alias("service_leave"), \
        min("creation_timestamp").alias("service_enter") \
     ).withColumn("duration", (col("service_leave")-col("service_enter")).cast("long")) \
    .show()

# session_id 기준으로 해당 시간동안의 페이지 체류시간 연산
batch_df.select("*") \
    .where(col("creation_timestamp") \
            .between(*timestamp_range(base_time, 1, 'D'))) \
    .groupBy("location", "session_id").agg( \
        avg("page_duration").alias("duration")*0.001
    ).show()


23/03/27 01:04:24 WARN DeprecatedConfigParameter: spark.cassandra.connection.keep_alive_ms is deprecated (DSE 6.0.0) and has been automatically replaced with parameter spark.cassandra.connection.keepAliveMS. 
+----------+--------------------+---------------+---------+---------+--------------------+--------------------+-------------+----------+----------+--------------------+--------+--------------------+------------------+
|service_id|  creation_timestamp|     session_id|client_id|    event|                 key|            location|page_duration|position_x|position_y|       prev_location|referrer|       service_token|         target_id|
+----------+--------------------+---------------+---------+---------+--------------------+--------------------+-------------+----------+----------+--------------------+--------+--------------------+------------------+
|         2|2023-03-27 01:01:...|test-session-id|        1|pageleave|test-session-id-1...|http://localhost:...|         2769|         0| 

### 4. Hive와 연결 및 INSERT
Hive 내 `test` DATABASE의 `weblogs` TABLE의 구조는 다음과 같다.
```
CREATE TABLE IF NOT EXISTS weblogs (
creation_timestamp STRING,
session_id STRING,
client_id STRING,
event STRING,
key STRING,
location STRING,
position_x STRING,
position_y STRING,
service_token STRING,
target_id STRING
) PARTITIONED BY (service_id STRING)
STORED AS ORC
LOCATION 'hdfs:///user/hive/warehouse';
```

### 주의:
`InsertInto`를 통해 테이블에 데이터를 넣기 위해선 정의된 테이블의 컬럼 순서와 Datadrame의 컬럼 순서가 같아야 한다.  
예시로, component 테이블에 데이터를 넣기 위해선 다음과 같은 SELECT문으로 컬럼의 순서를 조정해주어야 한다.
```
CREATE TABLE IF NOT EXISTS mata.components(
  total_click INT,
  target_id STRING,
  location STRING,
  update_timestamp TIMESTAMP,
  service_id BIGINT,
  CONSTRAINT fk_components_service_id FOREIGN KEY(service_id) REFERENCES mata.services(service_id) DISABLE NOVALIDATE 
) CLUSTERED BY (service_id) SORTED BY (update_timestamp DESC) INTO 10 BUCKETS
STORED AS ORC;
```
컬럼 순서: ("total_click", "target_id", "location", "update_timestamp", "service_id")

In [74]:
#components
components_df = batch_df.select("*") \
    .where(col("creation_timestamp") \
            .between(*timestamp_range(base_time, 1, 'D'))) \
    .where(col("event").like("click")) \
    .groupBy("service_id", "target_id", "location").agg( \
        count("key").alias("total_click"), \
    ).withColumn("update_timestamp", current_timestamp()) \
    .select("total_click", "target_id", "location", "update_timestamp", "service_id")
components_df.show()


#click
click_df = batch_df.select("*") \
    .where(col("creation_timestamp") \
            .between(*timestamp_range(base_time, 1, 'D'))) \
    .where(col("event").like("click")) \
    .groupBy("service_id", "position_x", "position_y", "location").agg( \
        count("key").alias("total_click"), \
    ).withColumn("update_timestamp", current_timestamp()) \
    .select("total_click", "position_x", "position_y","location", "update_timestamp", "service_id")
click_df.show()


#page_durations
page_durations_df = batch_df.select("*") \
    .where(col("creation_timestamp") \
            .between(*timestamp_range(base_time, 1, 'D'))) \
    .groupBy("service_id", "location", "service_id").agg(\
        count("*").alias("total_session"),\
        sum("page_duration").alias("total_duration"),\
     ).withColumn("update_timestamp", current_timestamp()) \
    .select("total_duration","total_session","location", "update_timestamp","service_id")
page_durations_df.show()


#page_journal
page_journal_df = batch_df.select("*") \
    .where(col("creation_timestamp") \
            .between(*timestamp_range(base_time, 1, 'D'))) \
    .groupBy("prev_location", "location", "service_id").agg(\
        count("*").alias("total_journal"),\
     ).withColumn("update_timestamp", current_timestamp()) \
    .select("total_journal",col("prev_location").alias("location_from"),col("location").alias("location_to"), "update_timestamp","service_id")
page_journal_df.show()

23/03/27 04:54:59 WARN DeprecatedConfigParameter: spark.cassandra.connection.keep_alive_ms is deprecated (DSE 6.0.0) and has been automatically replaced with parameter spark.cassandra.connection.keepAliveMS. 
+-----------+------------------+--------------------+--------------------+----------+
|total_click|         target_id|            location|    update_timestamp|service_id|
+-----------+------------------+--------------------+--------------------+----------+
|          4| button-first-back|http://localhost:...|2023-03-27 04:54:...|         2|
|          4|   button-to-first|http://localhost:...|2023-03-27 04:54:...|         2|
|          7|button-second-back|http://localhost:...|2023-03-27 04:54:...|         2|
|         12|button-second-view|http://localhost:...|2023-03-27 04:54:...|         2|
|          6|  button-to-second|http://localhost:...|2023-03-27 04:54:...|         2|
+-----------+------------------+--------------------+--------------------+----------+

23/03/27 04:55:0

In [75]:
components_df.write.mode("append") \
        .format("hive") \
        .insertInto("mata.components")

click_df.write.mode("append") \
        .format("hive") \
        .insertInto("mata.clicks")

page_durations_df.write.mode("append") \
        .format("hive") \
        .insertInto("mata.page_durations")

page_journal_df.write.mode("append") \
        .format("hive") \
        .insertInto("mata.page_journal")

23/03/27 04:55:32 WARN DeprecatedConfigParameter: spark.cassandra.connection.keep_alive_ms is deprecated (DSE 6.0.0) and has been automatically replaced with parameter spark.cassandra.connection.keepAliveMS. 
23/03/27 04:55:33 WARN DeprecatedConfigParameter: spark.cassandra.connection.keep_alive_ms is deprecated (DSE 6.0.0) and has been automatically replaced with parameter spark.cassandra.connection.keepAliveMS. 
23/03/27 04:55:33 WARN DeprecatedConfigParameter: spark.cassandra.connection.keep_alive_ms is deprecated (DSE 6.0.0) and has been automatically replaced with parameter spark.cassandra.connection.keepAliveMS. 
23/03/27 04:55:33 WARN DeprecatedConfigParameter: spark.cassandra.connection.keep_alive_ms is deprecated (DSE 6.0.0) and has been automatically replaced with parameter spark.cassandra.connection.keepAliveMS. 


In [None]:
session.stop()