# Cassandra + PySpark Batching 예제

### 1. findspark를 통해 pyspark 등 라이브러리 추가, SparkSession 생성

In [105]:
import findspark
findspark.init("/usr/local/lib/spark-3.3.2-bin-hadoop3")

from pyspark import SparkConf
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import *
from pyspark.sql.functions import udf, col, from_json, pandas_udf, split

session = SparkSession.builder \
    .appName("Jupyter_Notebook_2") \
    .master("yarn") \
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.2,com.datastax.spark:spark-cassandra-connector_2.12:3.3.0") \
    .config("spark.hadoop.hive.exec.dynamic.partition.mode", "nonstrict") \
    .enableHiveSupport() \
    .getOrCreate()

23/03/24 05:50:48 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.
23/03/24 05:50:50 WARN Client: Same path resource file:///root/.ivy2/jars/org.apache.spark_spark-sql-kafka-0-10_2.12-3.3.2.jar added multiple times to distributed cache.
23/03/24 05:50:50 WARN Client: Same path resource file:///root/.ivy2/jars/com.datastax.spark_spark-cassandra-connector_2.12-3.3.0.jar added multiple times to distributed cache.
23/03/24 05:50:50 WARN Client: Same path resource file:///root/.ivy2/jars/org.apache.spark_spark-token-provider-kafka-0-10_2.12-3.3.2.jar added multiple times to distributed cache.
23/03/24 05:50:50 WARN Client: Same path resource file:///root/.ivy2/jars/org.apache.kafka_kafka-clients-2.8.1.jar added multiple times to distributed cache.
23/03/24 05:50:50 WARN Client: Same path resource file:///root/.ivy2/jars/org.apache.commons_commons-pool2-2.11.1.jar added multiple times to distributed cache.
23/03/24 05:5

### 2. Cassandra와 연결

In [106]:
cassandra_keyspace = "tagmanager"
cassandra_table = "stream"

batch_df = session.read \
      .format("org.apache.spark.sql.cassandra") \
  .option("checkpointLocation", "/") \
  .option("spark.cassandra.connection.host", "master01") \
  .option("spark.cassandra.connection.port", 9042) \
  .option("keyspace", cassandra_keyspace) \
  .option("table", cassandra_table) \
  .option("spark.cassandra.connection.remoteConnectionsPerExecutor", 10) \
  .option("spark.cassandra.output.concurrent.writes", 1000) \
  .option("spark.cassandra.concurrent.reads", 512) \
  .option("spark.cassandra.output.batch.grouping.buffer.size", 1000) \
  .option("spark.cassandra.connection.keep_alive_ms", 600000000) \
      .load()
batch_df.printSchema()

23/03/24 05:51:22 WARN DeprecatedConfigParameter: spark.cassandra.connection.keep_alive_ms is deprecated (DSE 6.0.0) and has been automatically replaced with parameter spark.cassandra.connection.keepAliveMS. 
23/03/24 05:51:22 WARN DeprecatedConfigParameter: spark.cassandra.connection.keep_alive_ms is deprecated (DSE 6.0.0) and has been automatically replaced with parameter spark.cassandra.connection.keepAliveMS. 
root
 |-- service_id: long (nullable = false)
 |-- creation_timestamp: timestamp (nullable = true)
 |-- session_id: string (nullable = true)
 |-- client_id: long (nullable = true)
 |-- event: string (nullable = true)
 |-- key: string (nullable = true)
 |-- location: string (nullable = true)
 |-- page_duration: long (nullable = true)
 |-- position_x: integer (nullable = true)
 |-- position_y: integer (nullable = true)
 |-- prev_location: string (nullable = true)
 |-- referrer: string (nullable = true)
 |-- service_token: string (nullable = true)
 |-- target_id: string (nullabl

### 3. PySqark SQL을 이용해 쿼리 작성

In [107]:
session.conf.set("spark.sql.repl.eagerEval.maxStringLength", 10000) 

In [108]:
batch_df.select('prev_location').show()


23/03/24 05:51:24 WARN DeprecatedConfigParameter: spark.cassandra.connection.keep_alive_ms is deprecated (DSE 6.0.0) and has been automatically replaced with parameter spark.cassandra.connection.keepAliveMS. 




+--------------------+
|       prev_location|
+--------------------+
|http://localhost:...|
|http://localhost:...|
|http://localhost:...|
|http://localhost:...|
|http://localhost:...|
|http://localhost:...|
|http://localhost:...|
|http://localhost:...|
|http://localhost:...|
|http://localhost:...|
|http://localhost:...|
|http://localhost:...|
|http://localhost:...|
|http://localhost:...|
|http://localhost:...|
|http://localhost:...|
|http://localhost:...|
|http://localhost:...|
|http://localhost:...|
|http://localhost:...|
+--------------------+
only showing top 20 rows



                                                                                

In [109]:
batch_df.show()

23/03/24 05:53:23 WARN DeprecatedConfigParameter: spark.cassandra.connection.keep_alive_ms is deprecated (DSE 6.0.0) and has been automatically replaced with parameter spark.cassandra.connection.keepAliveMS. 
+----------+--------------------+---------------+---------+---------+--------------------+--------------------+-------------+----------+----------+--------------------+--------+--------------------+------------------+
|service_id|  creation_timestamp|     session_id|client_id|    event|                 key|            location|page_duration|position_x|position_y|       prev_location|referrer|       service_token|         target_id|
+----------+--------------------+---------------+---------+---------+--------------------+--------------------+-------------+----------+----------+--------------------+--------+--------------------+------------------+
|         2|2023-03-24 01:39:...|test-session-id|        1|    click|test-session-id-1...|http://localhost:...|         1055|       879| 

In [112]:
# distinct_df = df.groupBy("id").agg(F.first("domain").alias("domain"))
import pyspark.sql.functions as F
batch_df.select('service_id').distinct().show()

test = batch_df.groupBy("service_id").agg(F.first("page_duration").alias("page_duration")).select("service_id", "page_duration")

23/03/24 05:53:56 WARN DeprecatedConfigParameter: spark.cassandra.connection.keep_alive_ms is deprecated (DSE 6.0.0) and has been automatically replaced with parameter spark.cassandra.connection.keepAliveMS. 
+----------+
|service_id|
+----------+
|         2|
+----------+



In [126]:

#컴퓨넌트이름, 업데이트시간, 페이지경로
session.sql("SELECT target_id, referrer, count(*) as click_count FROM batch_df GROUP BY target_id, referrer").show()
# test.show()

# from pyspark.sql.functions import count

# batch_df.groupBy("target_id", "referrer").agg(count("*").alias("카운트")).show()


AnalysisException: Table or view not found: batch_df; line 1 pos 57;
'Aggregate ['target_id, 'referrer], ['target_id, 'referrer, count(1) AS click_count#6176L]
+- 'UnresolvedRelation [batch_df], [], false


In [138]:
from datetime import datetime
from datetime import timedelta

# 간편한 between 연산을 위해 만든 유틸리티 함수
# base_time: 기준 시간
# interval: 기분 시간으로부터 얼마나 조회를 할 지의 범위
# 초, 분, 시 등의 단위
# ex. timestamp_range("2023-03-21 13:49:00", 10, 'm') => 2023-03-21 13:49:00 부터 10분 이후의 시간까지
def timestamp_range(base_time, interval, unit):
    dt_obj = datetime.strptime(base_time, '%Y-%m-%d %H:%M:%S')
    if unit=='s':
        if interval>=0:
            return (dt_obj, dt_obj+timedelta(seconds=interval))
        else:
            return (dt_obj-timedelta(seconds=-interval), dt_obj)
    if unit=='m':
        if interval>=0:
            return (dt_obj, dt_obj+timedelta(minutes=interval))
        else:
            return (dt_obj-timedelta(minutes=-interval), dt_obj)
    if unit=='H':
        if interval>=0:
            return (dt_obj, dt_obj+timedelta(hours=interval))
        else:
            return (dt_obj-timedelta(hours=-interval), dt_obj)
    if unit=='D':
        if interval>=0:
            return (dt_obj, dt_obj+timedelta(days=interval))
        else:
            return (dt_obj-timedelta(days=-interval), dt_obj)
    if unit=='M':
        if interval>=0:
            return (dt_obj, dt_obj+timedelta(months=interval))
        else:
            return (dt_obj-timedelta(months=-interval), dt_obj)
    if unit=='Y':
        if interval>=0:
            return (dt_obj, dt_obj+timedelta(years=interval))
        else:
            return (dt_obj-timedelta(years=-interval), dt_obj)


base_time = "2023-03-24 01:39:00"

# 해당 시간 사이의 모든 데이터 조회
print("해당 시간 사이의 모든 데이터 조회")
batch_df.select("*") \
    .where(col("creation_timestamp") \
            .between(*timestamp_range(base_time, 1, 'm'))) \
    .show()

# 해당 시간 사이에 http://localhost:3000/second에서 일어난 click 이벤트 조회

print("해당 시간 사이에 http://localhost:3000/second에서 일어난 click 이벤트 조회")
batch_df.select("*") \
    .where(col("creation_timestamp") \
            .between(*timestamp_range(base_time, 1, 'm'))) \
    .where(col("location") \
           .like("http://localhost:3000/second")) \
    .where(col("event") \
           .like("click")) \
    .show()

# 해당 시간 사이에 http://localhost:3000/second에 접속한 사용자 조회
print("해당 시간 사이에 http://localhost:3000/second에 접속한 사용자 조회")
batch_df.select("*") \
    .where(col("creation_timestamp") \
            .between(*timestamp_range(base_time, -30, 'm'))) \
    .where(col("location") \
            .like("http://localhost:3000/second")) \
    .select("session_id").distinct() \
    .show()

# location, event 기준으로 그룹핑 후 개수 세기
batch_df.select("*") \
    .where(col("creation_timestamp") \
            .between(*timestamp_range(base_time, 1, 'D'))) \
    .groupBy("location", "event").count() \
    .show()

# session_id 기준으로 해당 시간동안의 서비스 체류시간 연산
batch_df.select("*") \
    .where(col("creation_timestamp") \
            .between(*timestamp_range(base_time, 1, 'D'))) \
    .groupBy("session_id").agg( \
        max("creation_timestamp").alias("service_leave"), \
        min("creation_timestamp").alias("service_enter") \
     ).withColumn("duration", (col("service_leave")-col("service_enter")).cast("long")) \
    .show()

# session_id 기준으로 해당 시간동안의 페이지 체류시간 연산

print("# session_id 기준으로 해당 시간동안의 페이지 체류시간 연산")
batch_df.select("*") \
    .where(col("creation_timestamp") \
            .between(*timestamp_range(base_time, 1, 'D'))) \
    .groupBy("location", "session_id").agg( \
        avg("page_duration").alias("duration")*0.001
    ).show()


해당 시간 사이의 모든 데이터 조회
23/03/24 06:13:22 WARN DeprecatedConfigParameter: spark.cassandra.connection.keep_alive_ms is deprecated (DSE 6.0.0) and has been automatically replaced with parameter spark.cassandra.connection.keepAliveMS. 
+----------+--------------------+---------------+---------+---------+--------------------+--------------------+-------------+----------+----------+--------------------+--------+--------------------+------------------+
|service_id|  creation_timestamp|     session_id|client_id|    event|                 key|            location|page_duration|position_x|position_y|       prev_location|referrer|       service_token|         target_id|
+----------+--------------------+---------------+---------+---------+--------------------+--------------------+-------------+----------+----------+--------------------+--------+--------------------+------------------+
|         2|2023-03-24 01:39:...|test-session-id|        1|    click|test-session-id-1...|http://localhost:...|      

In [139]:
batch_df.show()

23/03/24 06:13:29 WARN DeprecatedConfigParameter: spark.cassandra.connection.keep_alive_ms is deprecated (DSE 6.0.0) and has been automatically replaced with parameter spark.cassandra.connection.keepAliveMS. 
+----------+--------------------+---------------+---------+---------+--------------------+--------------------+-------------+----------+----------+--------------------+--------+--------------------+------------------+
|service_id|  creation_timestamp|     session_id|client_id|    event|                 key|            location|page_duration|position_x|position_y|       prev_location|referrer|       service_token|         target_id|
+----------+--------------------+---------------+---------+---------+--------------------+--------------------+-------------+----------+----------+--------------------+--------+--------------------+------------------+
|         2|2023-03-24 01:39:...|test-session-id|        1|    click|test-session-id-1...|http://localhost:...|         1055|       879| 

In [158]:
from datetime import datetime
from datetime import timedelta

# 간편한 between 연산을 위해 만든 유틸리티 함수
# base_time: 기준 시간
# interval: 기분 시간으로부터 얼마나 조회를 할 지의 범위
# 초, 분, 시 등의 단위
# ex. timestamp_range("2023-03-21 13:49:00", 10, 'm') => 2023-03-21 13:49:00 부터 10분 이후의 시간까지
def timestamp_range(base_time, interval, unit):
    dt_obj = datetime.strptime(base_time, '%Y-%m-%d %H:%M:%S')
    if unit=='s':
        if interval>=0:
            return (dt_obj, dt_obj+timedelta(seconds=interval))
        else:
            return (dt_obj-timedelta(seconds=-interval), dt_obj)
    if unit=='m':
        if interval>=0:
            return (dt_obj, dt_obj+timedelta(minutes=interval))
        else:
            return (dt_obj-timedelta(minutes=-interval), dt_obj)
    if unit=='H':
        if interval>=0:
            return (dt_obj, dt_obj+timedelta(hours=interval))
        else:
            return (dt_obj-timedelta(hours=-interval), dt_obj)
    if unit=='D':
        if interval>=0:
            return (dt_obj, dt_obj+timedelta(days=interval))
        else:
            return (dt_obj-timedelta(days=-interval), dt_obj)
    if unit=='M':
        if interval>=0:
            return (dt_obj, dt_obj+timedelta(months=interval))
        else:
            return (dt_obj-timedelta(months=-interval), dt_obj)
    if unit=='Y':
        if interval>=0:
            return (dt_obj, dt_obj+timedelta(years=interval))
        else:
            return (dt_obj-timedelta(years=-interval), dt_obj)


base_time = "2023-03-24 01:39:00"


print("해당 시간 사이에 http://localhost:3000/second에서 컴포넌트 조회")

result = batch_df.select("*") \
    .where(col("creation_timestamp") \
            .between(*timestamp_range(base_time, 1, 'm'))) \
    .where(col("event") \
           .like("click"))

result_df = result.groupBy(col("service_id"),col("target_id"), col("location")) \
                    .agg(count("*").alias("click_count")) \
                    .select(col("target_id"), col("location"), col("click_count"),col("service_id"))\
                    .withColumn("update_timestamp", current_timestamp())
result_df.show()


print("해당 시간 사이에 마우스 클릭")


result2_df = result.groupBy(col("service_id"),col("position_x"), col("position_y"), col("location")) \
                    .agg(count("*").alias("total_click")) \
                    .select(col("position_x"), col("position_y"), col("service_id"), col("location"))\
                    .withColumn("update_timestamp", current_timestamp())

result2_df.show()


print("해당 시간 사이에 페이지 체류")


result2_df = result.groupBy(col("service_id"),col("position_x"), col("position_y"), col("location")) \
                    .agg(count("*").alias("total_click")) \
                    .select(col("position_x"), col("position_y"), col("service_id"), col("location"))\
                    .withColumn("update_timestamp", current_timestamp())

result2_df.show()


해당 시간 사이에 http://localhost:3000/second에서 컴포넌트 조회
23/03/24 07:57:26 WARN DeprecatedConfigParameter: spark.cassandra.connection.keep_alive_ms is deprecated (DSE 6.0.0) and has been automatically replaced with parameter spark.cassandra.connection.keepAliveMS. 
+------------------+--------------------+-----------+----------+--------------------+
|         target_id|            location|click_count|service_id|    update_timestamp|
+------------------+--------------------+-----------+----------+--------------------+
| button-first-back|http://localhost:...|          2|         2|2023-03-24 07:57:...|
|   button-to-first|http://localhost:...|          2|         2|2023-03-24 07:57:...|
|button-second-back|http://localhost:...|          4|         2|2023-03-24 07:57:...|
|button-second-view|http://localhost:...|          7|         2|2023-03-24 07:57:...|
|  button-to-second|http://localhost:...|          4|         2|2023-03-24 07:57:...|
+------------------+--------------------+-----------+-

### 4. Hive와 연결 및 INSERT
Hive 내 `test` DATABASE의 `weblogs` TABLE의 구조는 다음과 같다.
```
CREATE TABLE IF NOT EXISTS weblogs (
creation_timestamp STRING,
session_id STRING,
client_id STRING,
event STRING,
key STRING,
location STRING,
position_x STRING,
position_y STRING,
service_token STRING,
target_id STRING
) PARTITIONED BY (service_id STRING)
STORED AS ORC
LOCATION 'hdfs:///user/hive/warehouse';
```

In [None]:
hive_df = batch_df.select("*") \
    .where(col("creation_timestamp") \
            .between(*timestamp_range(base_time, 1, 'D')))

hive_df.write.mode("append") \
        .format("hive") \
        .partitionBy("service_id") \
        .saveAsTable("test.weblogs")

In [104]:
session.stop()