# Kafka Streaming + PySpark 예제

### 1. findspark를 통해 pyspark 등 라이브러리 추가

In [12]:
import findspark

# Location of the local Spark distribution that findspark should expose
# to this Python process.
spark_home = "/usr/local/lib/spark-3.3.2-bin-hadoop3"
findspark.init(spark_home=spark_home)

### 2. SparkConf를 통해 configuration 추가하고, SparkContext 생성
spark-kafka와 spark-cassandra 의존성이 추가되어야 한다.

In [None]:
from pyspark import SparkConf
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import *
from pyspark.sql.functions import udf
from pyspark.sql.functions import col, pandas_udf, split

# Import SparkContext explicitly rather than relying on a wildcard import
# to bring the name into scope.
from pyspark import SparkContext

# Maven coordinates of the Kafka source and the Cassandra connector;
# they are handed to Spark as one comma-separated "spark.jars.packages" value.
spark_packages = [
    "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.2",
    "com.datastax.spark:spark-cassandra-connector_2.12:3.3.0",
]

sconf = SparkConf()
sconf.setAppName("Jupyter_Notebook").set("spark.jars.packages", ",".join(spark_packages))

# Context shared by the later cells (used to build the SparkSession).
sc = SparkContext(conf=sconf)

23/03/10 07:27:09 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.
23/03/10 07:27:13 WARN Client: Same path resource file:///root/.ivy2/jars/org.apache.spark_spark-sql-kafka-0-10_2.12-3.3.2.jar added multiple times to distributed cache.
23/03/10 07:27:13 WARN Client: Same path resource file:///root/.ivy2/jars/com.datastax.spark_spark-cassandra-connector_2.12-3.3.0.jar added multiple times to distributed cache.
23/03/10 07:27:13 WARN Client: Same path resource file:///root/.ivy2/jars/org.apache.spark_spark-token-provider-kafka-0-10_2.12-3.3.2.jar added multiple times to distributed cache.
23/03/10 07:27:13 WARN Client: Same path resource file:///root/.ivy2/jars/org.apache.kafka_kafka-clients-2.8.1.jar added multiple times to distributed cache.
23/03/10 07:27:13 WARN Client: Same path resource file:///root/.ivy2/jars/org.apache.commons_commons-pool2-2.11.1.jar added multiple times to distributed cache.
23/03/10 07:2

### 3. Kafka Topic에 subscribe하여 Session의 readStream을 정의
printSchema() 메소드를 통해 Kafka의 스키마를 확인할 수 있다.

In [None]:
# Kafka broker address and the topic this notebook subscribes to.
kafka_bootstrap_servers = 'slave03:9092'
topic = 'quickstart-events'

# Wrap the existing SparkContext in a SparkSession.
session = SparkSession(sc)

# Configure the Kafka streaming source; nothing is read until a
# writeStream query is started on the resulting DataFrame.
kafka_reader = (
    session.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_bootstrap_servers)
    .option("failOnDataLoss", "False")
    .option("subscribe", topic)
)
streaming_df = kafka_reader.load()

# Show the schema of the streaming DataFrame.
streaming_df.printSchema()

### 4. Binary 형태인 key, value를 String으로 cast하여 전처리
key는 null값을 가지고 있어, 임의로 integer로 cast한 timestamp를 넣어주었다.

In [None]:
import time
from pyspark.sql.functions import col

# Target Cassandra keyspace and table for the streaming sink.
cassandra_keyspace = "mykeyspace"
cassandra_table = "users"

# NOTE(review): `key` is not referenced by any visible cell — confirm it is still needed.
key = 0

# Column expressions: the Kafka payload as text, and the record timestamp
# cast to int to stand in for the key column.
value_as_text = col("value").cast("string")
timestamp_as_int = col("timestamp").cast("int")

# Keep every source column, overwriting "value" and "key" with the casts above.
streamming_query = (
    streaming_df.select("*")
    .withColumn("value", value_as_text)
    .withColumn("key", timestamp_as_int)
)



### 5. Spark Cassandra Connector를 사용해 Cassandra 연결, Streaming되는 dataframe 출력
20초 간 INSERT 후에 자동으로 writeStream 쿼리를 종료한다. Cassandra의 keyspace, table은 그 형태가 미리 정의되어 있어야 한다.

In [None]:
# Stream the prepared DataFrame into Cassandra through the Spark Cassandra
# Connector. For debugging, the console sink can be used instead:
#   query = streamming_query.writeStream.format("console").start()
#
# NOTE(review): checkpointLocation "/" puts checkpoint data at the root of
# the filesystem — confirm this is intended; a dedicated directory is safer.
query = (
    streamming_query.writeStream.outputMode("append")
    .format("org.apache.spark.sql.cassandra")
    .option("checkpointLocation", "/")
    .option("spark.cassandra.connection.host", "master01")
    .option("spark.cassandra.connection.port", 9042)
    .option("keyspace", cassandra_keyspace)
    .option("table", cassandra_table)
    .option("spark.cassandra.connection.remoteConnectionsPerExecutor", 10)
    .option("spark.cassandra.output.concurrent.writes", 1000)
    .option("spark.cassandra.concurrent.reads", 512)
    .option("spark.cassandra.output.batch.grouping.buffer.size", 1000)
    .option("spark.cassandra.connection.keep_alive_ms", 600000000)
    .start()
)

# Let the query run for at most 20 seconds. Unlike a plain sleep,
# awaitTermination() returns as soon as the query ends and re-raises the
# query's exception if it failed, so errors are not silently ignored.
# The finally clause guarantees the query is stopped even on failure or
# when the cell is interrupted.
try:
    query.awaitTermination(timeout=20)
finally:
    query.stop()

### 6. Session과 Context 종료

In [11]:
# Stop the SparkSession first, then the SparkContext it was created from.
for spark_handle in (session, sc):
    spark_handle.stop()