# Kafka Streaming + PySpark 예제

### 1. findspark를 통해 pyspark 등 라이브러리 추가

In [22]:
import findspark
findspark.init("/usr/local/lib/spark-3.3.2-bin-hadoop3")

### 2. SparkConf를 통해 configuration 추가하고, SparkContext 생성
spark-kafka와 spark-cassandra 의존성이 추가되어야 한다.

In [70]:
from pyspark import SparkConf
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import *
from pyspark.sql.functions import udf
from pyspark.sql.functions import col, pandas_udf, split

sconf = SparkConf()
sconf.setAppName("Jupyter_Notebook").set("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.2,com.datastax.spark:spark-cassandra-connector_2.12:3.3.0")

sc = SparkContext(conf=sconf)

23/03/19 11:00:59 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.
23/03/19 11:01:02 WARN Client: Same path resource file:///root/.ivy2/jars/org.apache.spark_spark-sql-kafka-0-10_2.12-3.3.2.jar added multiple times to distributed cache.
23/03/19 11:01:02 WARN Client: Same path resource file:///root/.ivy2/jars/com.datastax.spark_spark-cassandra-connector_2.12-3.3.0.jar added multiple times to distributed cache.
23/03/19 11:01:02 WARN Client: Same path resource file:///root/.ivy2/jars/org.apache.spark_spark-token-provider-kafka-0-10_2.12-3.3.2.jar added multiple times to distributed cache.
23/03/19 11:01:02 WARN Client: Same path resource file:///root/.ivy2/jars/org.apache.kafka_kafka-clients-2.8.1.jar added multiple times to distributed cache.
23/03/19 11:01:02 WARN Client: Same path resource file:///root/.ivy2/jars/org.apache.commons_commons-pool2-2.11.1.jar added multiple times to distributed cache.
23/03/19 11:0

### 3. Kafka Topic에 sobscribe하여 Session의 readStream을 정의
printSchema() 메소드를 통해 Kafka의 스키마를 확인할 수 있다.

In [71]:
from pyspark.sql.functions import from_json, col

kafka_bootstrap_servers = 'master01:9092,master02:9092,slave01:9092,slave02:9092,slave03:9092'
topic = 'tagmanager'
schema = StructType(
        [
                StructField("serviceToken", StringType()),
                StructField("clientId", LongType()),
                StructField("sessionId", StringType()),
                StructField("event", StringType()),
                StructField("targetId", StringType()),
                StructField("positionX", IntegerType()),
                StructField("positionY", IntegerType()),
                StructField("location", StringType()),
                StructField("timestamp", LongType())
        ]
)

session = SparkSession(sc)
streaming_df = session \
  .readStream \
  .format("kafka") \
  .option("kafka.bootstrap.servers", kafka_bootstrap_servers) \
  .option("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer") \
  .option("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer") \
  .option("failOnDataLoss","False") \
  .option("subscribe", topic) \
  .load() \
  .withColumn("key", col("key").cast("string")) \
  .withColumn("value", from_json(col("value").cast("string"), schema))
streaming_df.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: struct (nullable = true)
 |    |-- serviceToken: string (nullable = true)
 |    |-- clientId: long (nullable = true)
 |    |-- sessionId: string (nullable = true)
 |    |-- event: string (nullable = true)
 |    |-- targetId: string (nullable = true)
 |    |-- positionX: integer (nullable = true)
 |    |-- positionY: integer (nullable = true)
 |    |-- location: string (nullable = true)
 |    |-- timestamp: long (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



### 4. Binary 형태인 key, value를 String으로 cast하여 전처리
key는 null값을 가지고 있어, 임의로 interger로 cast한 timestamp를 넣어주었다.

In [72]:
import time
from pyspark.sql.functions import col

cassandra_keyspace = "tagmanager"
cassandra_table = "stream"


streamming_query = streaming_df.select("key", "value.*") \
    .withColumnRenamed("serviceToken", "service_token") \
    .withColumnRenamed("clientId", "client_id") \
    .withColumnRenamed("sessionId", "session_id") \
    .withColumnRenamed("event", "event") \
    .withColumnRenamed("targetId", "target_id") \
    .withColumnRenamed("positionX", "position_x") \
    .withColumnRenamed("positionY", "position_y") \
    .withColumnRenamed("location", "location") \
    .withColumnRenamed("timestamp", "creation_timestamp")

In [86]:
query = streamming_query.writeStream.outputMode("complete").format("console").start()
time.sleep(10)
query.stop()

23/03/19 11:05:40 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-cbfed48a-9027-463b-b313-4da584bec5c8. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/03/19 11:05:40 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


AnalysisException: Complete output mode not supported when there are no streaming aggregations on streaming DataFrames/Datasets;
Project [key#2741, service_token#2776, client_id#2787L, session_id#2798, event#2809, target_id#2820, position_x#2831, position_y#2842, location#2853, timestamp#2765L AS creation_timestamp#2864L]
+- Project [key#2741, service_token#2776, client_id#2787L, session_id#2798, event#2809, target_id#2820, position_x#2831, position_y#2842, location#2764 AS location#2853, timestamp#2765L]
   +- Project [key#2741, service_token#2776, client_id#2787L, session_id#2798, event#2809, target_id#2820, position_x#2831, positionY#2763 AS position_y#2842, location#2764, timestamp#2765L]
      +- Project [key#2741, service_token#2776, client_id#2787L, session_id#2798, event#2809, target_id#2820, positionX#2762 AS position_x#2831, positionY#2763, location#2764, timestamp#2765L]
         +- Project [key#2741, service_token#2776, client_id#2787L, session_id#2798, event#2809, targetId#2761 AS target_id#2820, positionX#2762, positionY#2763, location#2764, timestamp#2765L]
            +- Project [key#2741, service_token#2776, client_id#2787L, session_id#2798, event#2760 AS event#2809, targetId#2761, positionX#2762, positionY#2763, location#2764, timestamp#2765L]
               +- Project [key#2741, service_token#2776, client_id#2787L, sessionId#2759 AS session_id#2798, event#2760, targetId#2761, positionX#2762, positionY#2763, location#2764, timestamp#2765L]
                  +- Project [key#2741, service_token#2776, clientId#2758L AS client_id#2787L, sessionId#2759, event#2760, targetId#2761, positionX#2762, positionY#2763, location#2764, timestamp#2765L]
                     +- Project [key#2741, serviceToken#2757 AS service_token#2776, clientId#2758L, sessionId#2759, event#2760, targetId#2761, positionX#2762, positionY#2763, location#2764, timestamp#2765L]
                        +- Project [key#2741, value#2749.serviceToken AS serviceToken#2757, value#2749.clientId AS clientId#2758L, value#2749.sessionId AS sessionId#2759, value#2749.event AS event#2760, value#2749.targetId AS targetId#2761, value#2749.positionX AS positionX#2762, value#2749.positionY AS positionY#2763, value#2749.location AS location#2764, value#2749.timestamp AS timestamp#2765L]
                           +- Project [key#2741, from_json(StructField(serviceToken,StringType,true), StructField(clientId,LongType,true), StructField(sessionId,StringType,true), StructField(event,StringType,true), StructField(targetId,StringType,true), StructField(positionX,IntegerType,true), StructField(positionY,IntegerType,true), StructField(location,StringType,true), StructField(timestamp,LongType,true), cast(value#2728 as string), Some(Etc/UTC)) AS value#2749, topic#2729, partition#2730, offset#2731L, timestamp#2732, timestampType#2733]
                              +- Project [cast(key#2727 as string) AS key#2741, value#2728, topic#2729, partition#2730, offset#2731L, timestamp#2732, timestampType#2733]
                                 +- StreamingRelationV2 org.apache.spark.sql.kafka010.KafkaSourceProvider@2faa5d18, kafka, org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaTable@216db69d, [key.deserializer=org.apache.kafka.common.serialization.StringDeserializer, subscribe=tagmanager, failOnDataLoss=False, value.deserializer=org.apache.kafka.common.serialization.StringDeserializer, kafka.bootstrap.servers=master01:9092,master02:9092,slave01:9092,slave02:9092,slave03:9092], [key#2727, value#2728, topic#2729, partition#2730, offset#2731L, timestamp#2732, timestampType#2733], StreamingRelation DataSource(org.apache.spark.sql.SparkSession@25523668,kafka,List(),None,List(),None,Map(key.deserializer -> org.apache.kafka.common.serialization.StringDeserializer, subscribe -> tagmanager, failOnDataLoss -> False, value.deserializer -> org.apache.kafka.common.serialization.StringDeserializer, kafka.bootstrap.servers -> master01:9092,master02:9092,slave01:9092,slave02:9092,slave03:9092),None), kafka, [key#2720, value#2721, topic#2722, partition#2723, offset#2724L, timestamp#2725, timestampType#2726]


### 5. Spark Cassandra Connector를 사용해 Cassandra 연결, Streaming되는 dataframe 출력
20초 간 INSERT 후에 자동으로 writeStream 쿼리를 종료한다. Cassandra의 keyspace, table은 그 형태가 미리 정의되어있어야 한다.

In [80]:
# query = streamming_query.writeStream.format("console").start()

query = streamming_query.writeStream.outputMode("append") \
      .format("org.apache.spark.sql.cassandra") \
  .option("checkpointLocation", "/") \
  .option("spark.cassandra.connection.host", "master01") \
  .option("spark.cassandra.connection.port", 9042) \
  .option("keyspace", cassandra_keyspace) \
  .option("table", cassandra_table) \
  .option("spark.cassandra.connection.remoteConnectionsPerExecutor", 10) \
  .option("spark.cassandra.output.concurrent.writes", 1000) \
  .option("spark.cassandra.concurrent.reads", 512) \
  .option("spark.cassandra.output.batch.grouping.buffer.size", 1000) \
  .option("spark.cassandra.connection.keep_alive_ms", 600000000) \
      .start()

time.sleep(10)
query.stop()

23/03/19 11:04:26 WARN DeprecatedConfigParameter: spark.cassandra.connection.keep_alive_ms is deprecated (DSE 6.0.0) and has been automatically replaced with parameter spark.cassandra.connection.keepAliveMS. 
23/03/19 11:04:26 WARN DeprecatedConfigParameter: spark.cassandra.connection.keep_alive_ms is deprecated (DSE 6.0.0) and has been automatically replaced with parameter spark.cassandra.connection.keepAliveMS. 
23/03/19 11:04:26 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
23/03/19 11:04:26 WARN DeprecatedConfigParameter: spark.cassandra.connection.keep_alive_ms is deprecated (DSE 6.0.0) and has been automatically replaced with parameter spark.cassandra.connection.keepAliveMS. 
23/03/19 11:04:26 WARN DeprecatedConfigParameter: spark.cassandra.connection.keep_alive_ms is deprecated (DSE 6.0.0) and has been automatically replaced with parameter spark.cassandra.connection.keepAliveMS. 
23/03/19 11:04:26 WA

[Stage 10:>                                                         (0 + 1) / 1]

23/03/19 11:04:29 ERROR TaskSetManager: Task 0 in stage 10.0 failed 4 times; aborting job
23/03/19 11:04:29 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@4bfc4528 is aborting.
23/03/19 11:04:29 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@4bfc4528 aborted.
23/03/19 11:04:29 ERROR MicroBatchExecution: Query [id = cc0a5d04-9df5-41eb-a7d9-731bd39fece0, runId = bd757545-178a-424f-8bf1-431cce50a356] terminated with error
org.apache.spark.SparkException: Writing job aborted
	at org.apache.spark.sql.errors.QueryExecutionErrors$.writingJobAbortedError(QueryExecutionErrors.scala:767)
	at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.writeWithV2(WriteToDataSourceV2Exec.scala:409)
	at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.writeWithV2$(WriteToDataSourceV2Exec.scala:353)
	at org.apache.spark.sql.execution.d

### 6. Session과 Context 종료

In [1]:
session.stop()
sc.stop()

NameError: name 'sc' is not defined