# Cassandra + PySpark Batching 예제

### 1. findspark를 통해 pyspark 등 라이브러리 추가, SparkContext 생성

In [4]:
import findspark
findspark.init("/usr/local/lib/spark-3.3.2-bin-hadoop3")

from pyspark import SparkConf
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import *
from pyspark.sql.functions import udf, col, from_json, pandas_udf, split

# Spark configuration for this notebook: the application name plus the
# Kafka SQL source and Cassandra connector packages listed in
# spark.jars.packages.
sconf = (
    SparkConf()
    .setAppName("Jupyter_Notebook_2")
    .set(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.2,com.datastax.spark:spark-cassandra-connector_2.12:3.3.0",
    )
)

# Start the SparkContext with that configuration.
sc = SparkContext(conf=sconf)

23/03/21 13:52:37 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.
23/03/21 13:52:43 WARN Client: Same path resource file:///root/.ivy2/jars/org.apache.spark_spark-sql-kafka-0-10_2.12-3.3.2.jar added multiple times to distributed cache.
23/03/21 13:52:43 WARN Client: Same path resource file:///root/.ivy2/jars/com.datastax.spark_spark-cassandra-connector_2.12-3.3.0.jar added multiple times to distributed cache.
23/03/21 13:52:43 WARN Client: Same path resource file:///root/.ivy2/jars/org.apache.spark_spark-token-provider-kafka-0-10_2.12-3.3.2.jar added multiple times to distributed cache.
23/03/21 13:52:43 WARN Client: Same path resource file:///root/.ivy2/jars/org.apache.kafka_kafka-clients-2.8.1.jar added multiple times to distributed cache.
23/03/21 13:52:43 WARN Client: Same path resource file:///root/.ivy2/jars/org.apache.commons_commons-pool2-2.11.1.jar added multiple times to distributed cache.
23/03/21 13:5

### 2. SparkSession 시작, Cassandra와 연결

In [7]:
# Wrap the running SparkContext in a SparkSession to get the DataFrame API.
session = SparkSession(sc)

# Keyspace and table to read from.
cassandra_keyspace = "tagmanager"
cassandra_table = "stream"

# Batch-read the table through the Spark Cassandra connector.
# NOTE(review): "checkpointLocation" and the "spark.cassandra.output.*"
# settings look like streaming/write options that a batch read may not
# use -- confirm before removing them.
batch_df = (
    session.read
    .format("org.apache.spark.sql.cassandra")
    .option("checkpointLocation", "/")
    # Connection target.
    .option("spark.cassandra.connection.host", "master01")
    .option("spark.cassandra.connection.port", 9042)
    # Source keyspace / table.
    .option("keyspace", cassandra_keyspace)
    .option("table", cassandra_table)
    # Connector tuning.
    .option("spark.cassandra.connection.remoteConnectionsPerExecutor", 10)
    .option("spark.cassandra.output.concurrent.writes", 1000)
    .option("spark.cassandra.concurrent.reads", 512)
    .option("spark.cassandra.output.batch.grouping.buffer.size", 1000)
    # Spark logs a deprecation warning for this key and substitutes
    # spark.cassandra.connection.keepAliveMS.
    .option("spark.cassandra.connection.keep_alive_ms", 600000000)
    .load()
)

# Print the column names and types of the loaded table.
batch_df.printSchema()

23/03/21 13:53:48 WARN DeprecatedConfigParameter: spark.cassandra.connection.keep_alive_ms is deprecated (DSE 6.0.0) and has been automatically replaced with parameter spark.cassandra.connection.keepAliveMS. 
23/03/21 13:53:48 WARN DeprecatedConfigParameter: spark.cassandra.connection.keep_alive_ms is deprecated (DSE 6.0.0) and has been automatically replaced with parameter spark.cassandra.connection.keepAliveMS. 
root
 |-- service_id: long (nullable = false)
 |-- creation_timestamp: timestamp (nullable = true)
 |-- session_id: string (nullable = true)
 |-- client_id: long (nullable = true)
 |-- event: string (nullable = true)
 |-- key: string (nullable = true)
 |-- location: string (nullable = true)
 |-- position_x: integer (nullable = true)
 |-- position_y: integer (nullable = true)
 |-- service_token: string (nullable = true)
 |-- target_id: string (nullable = true)



### 3. PySpark SQL을 이용해 쿼리 작성

In [50]:
from datetime import datetime
from datetime import timedelta

# 간편한 between 연산을 위해 만든 유틸리티 함수
# base_time: 기준 시간
# interval: 기준 시간으로부터 얼마나 조회를 할 지의 범위
# 초, 분, 시 등의 단위
# ex. timestamp_range("2023-03-21 13:49:00", 10, 'm') => 2023-03-21 13:49:00 부터 10분 이후의 시간까지
def timestamp_range(base_time, interval, unit):
    """Return the ``(start, end)`` datetimes of a window starting at *base_time*.

    Intended to be unpacked into a range comparison, e.g.
    ``col.between(*timestamp_range("2023-03-21 13:49:00", 10, 'm'))``.

    Args:
        base_time: Window start as a ``'%Y-%m-%d %H:%M:%S'`` string.
        interval: Length of the window, counted in ``unit``. Must be an
            integer for the calendar units ``'M'`` and ``'Y'``.
        unit: ``'s'`` seconds, ``'m'`` minutes, ``'H'`` hours, ``'D'`` days,
            ``'M'`` calendar months, ``'Y'`` calendar years.

    Returns:
        A ``(start, end)`` tuple of naive ``datetime`` objects.

    Raises:
        ValueError: If ``base_time`` does not match the expected format or
            ``unit`` is not one of the supported codes.
    """
    import calendar

    start = datetime.strptime(base_time, '%Y-%m-%d %H:%M:%S')

    # Units with a fixed duration map directly onto timedelta keywords.
    fixed_units = {'s': 'seconds', 'm': 'minutes', 'H': 'hours', 'D': 'days'}
    if unit in fixed_units:
        return (start, start + timedelta(**{fixed_units[unit]: interval}))

    # timedelta has no months/years keywords (their length varies), so
    # calendar units are shifted on the month index instead.
    if unit in ('M', 'Y'):
        months = interval if unit == 'M' else interval * 12
        year, month_zero_based = divmod(
            start.year * 12 + (start.month - 1) + months, 12
        )
        month = month_zero_based + 1
        # Clamp the day so e.g. Jan 31 + 1 month lands on the last day of Feb.
        day = min(start.day, calendar.monthrange(year, month)[1])
        return (start, start.replace(year=year, month=month, day=day))

    raise ValueError(
        "unsupported unit %r; expected one of 's', 'm', 'H', 'D', 'M', 'Y'"
        % (unit,)
    )


# Start of the time window used by every query below.
base_time = "2023-03-21 13:49:00"


# All rows created within one minute of base_time.
batch_df.select("*") \
    .where(col("creation_timestamp") \
            .between(*timestamp_range(base_time, 1, 'm'))) \
    .show()

# Click events on http://localhost:3000/second within the same one-minute window.
batch_df.select("*") \
    .where(col("creation_timestamp") \
            .between(*timestamp_range(base_time, 1, 'm'))) \
    .where(col("location") \
           .like("http://localhost:3000/second")) \
    .where(col("event") \
           .like("click")) \
    .show()

# Distinct rows within the same one-minute window.
# DataFrame.distinct() takes no arguments; it de-duplicates whole rows.
batch_df.select("*") \
    .where(col("creation_timestamp") \
            .between(*timestamp_range(base_time, 1, 'm'))) \
    .distinct() \
    .show()

# Row counts per (location, event) pair within one day of base_time.
batch_df.select("*") \
    .where(col("creation_timestamp") \
            .between(*timestamp_range(base_time, 1, 'D'))) \
    .groupBy("location", "event").count() \
    .show()

23/03/21 14:53:28 WARN DeprecatedConfigParameter: spark.cassandra.connection.keep_alive_ms is deprecated (DSE 6.0.0) and has been automatically replaced with parameter spark.cassandra.connection.keepAliveMS. 
+----------+--------------------+---------------+---------+---------+--------------------+--------------------+----------+----------+--------------------+------------------+
|service_id|  creation_timestamp|     session_id|client_id|    event|                 key|            location|position_x|position_y|       service_token|         target_id|
+----------+--------------------+---------------+---------+---------+--------------------+--------------------+----------+----------+--------------------+------------------+
|         2|2023-03-21 13:49:...|test-session-id|        1|pageleave|test-session-id-1...|http://localhost:...|         0|         0|tag-manager-servi...|              none|
|         2|2023-03-21 13:49:...|test-session-id|        1|    click|test-session-id-1...|http:

                                                                                

+----------+--------------------+---------------+---------+-----+--------------------+--------------------+----------+----------+--------------------+------------------+
|service_id|  creation_timestamp|     session_id|client_id|event|                 key|            location|position_x|position_y|       service_token|         target_id|
+----------+--------------------+---------------+---------+-----+--------------------+--------------------+----------+----------+--------------------+------------------+
|         2|2023-03-21 13:49:...|test-session-id|        1|click|test-session-id-1...|http://localhost:...|       940|       691|tag-manager-servi...|button-second-back|
|         2|2023-03-21 13:49:...|test-session-id|        1|click|test-session-id-1...|http://localhost:...|       960|       708|tag-manager-servi...|button-second-back|
|         2|2023-03-21 13:49:...|test-session-id|        1|click|test-session-id-1...|http://localhost:...|       980|       641|tag-manager-servi...|



+--------------------+---------+-----+
|            location|    event|count|
+--------------------+---------+-----+
|http://localhost:...|pageleave|    3|
|http://localhost:...|pageenter|    4|
|http://localhost:...|pageenter|    1|
|http://localhost:...|    click|    5|
|http://localhost:...|pageenter|    2|
|http://localhost:...|    click|   20|
|http://localhost:...|pageleave|    6|
|http://localhost:...|    click|    2|
|http://localhost:...|pageleave|    2|
+--------------------+---------+-----+



                                                                                

In [3]:
# Stop the SparkContext created at the top of the notebook.
sc.stop()