In [1]:
import os
import sys
# Point PySpark at the cluster's Python interpreter and Spark installation,
# and set the arguments handed to spark-submit.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 8 --executor-memory 4g --executor-cores 2 --driver-memory 2g pyspark-shell',
})

spark_home = os.environ['SPARK_HOME']

# Prepend Spark's bundled Python sources so `import pyspark` resolves:
# the py4j archive ends up first, followed by the pyspark package directory.
sys.path[:0] = [
    os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'),
    os.path.join(spark_home, 'python'),
]

In [2]:
from pyspark import keyword_only

from pyspark.ml import Transformer, Pipeline
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, Normalizer, StopWordsRemover, CountVectorizer, VectorAssembler
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param, Params, TypeConverters
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier


from pyspark.sql import DataFrame
from pyspark.sql.types import StructType, StructField, DoubleType, FloatType, ArrayType, StringType, IntegerType, LongType

from pyspark.sql.window import Window
from pyspark.sql.functions import udf, col, when, isnan, isnull, broadcast, desc, lower, pandas_udf, row_number, explode, split
from pyspark.sql.functions import array, collect_set, lit, from_json, to_json, struct, regexp_replace

from pyspark.mllib.linalg import SparseVector, DenseVector



import json
import re

In [3]:
from pyspark.sql import SparkSession

# Build (or reuse) the local session; the Kafka SQL connector is requested
# through spark.jars.packages.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("KAM_laba4")
    .config("spark.driver.memory", "512m")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5")
    .getOrCreate()
)

spark

In [4]:
# Options for the streaming Kafka source: broker, topic, and where to start reading.
read_kafka_params = dict([
    ("kafka.bootstrap.servers", 'spark-master-1.newprolab.com:6667'),
    ("subscribe", "input_artem.kropis"),
    ("startingOffsets", "latest"),
])

kafka_stream_reader = spark.readStream.format("kafka").options(**read_kafka_params)
kafka_sdf = kafka_stream_reader.load()

In [5]:
kafka_sdf.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [6]:
def create_console_sink(df):
    """Return an unstarted console writer for a streaming DataFrame.

    The writer uses the ``console`` format with a 5-second processing-time
    trigger, ``truncate=false`` and ``numRows=20``. Call ``.start()`` on the
    returned writer to launch the query.
    """
    writer = df.writeStream.format("console")
    writer = writer.trigger(processingTime="5 seconds")
    for key, value in (("truncate", "false"), ("numRows", "20")):
        writer = writer.option(key, value)
    return writer

In [7]:
# Keep the payload as text together with the Kafka coordinates of each record.
kafka_columns = [col("value").cast("string")]
kafka_columns += [col(name) for name in ("topic", "partition", "offset")]
parsed_sdf = kafka_sdf.select(*kafka_columns)

# Attach the console writer and start the query; `sq` is what `.start()` returns.
sink = create_console_sink(parsed_sdf)
sq = sink.start()

In [11]:
sq.isActive

True

In [14]:
sq.stop()

KeyboardInterrupt: 

In [21]:
# Batch (non-streaming) read of the input topic; the resulting frame is cached.
kafka_batch_reader = (
    spark.read
    .format('kafka')
    .option("kafka.bootstrap.servers", 'spark-master-1.newprolab.com:6667')
    .option("subscribe", "input_artem.kropis")
    .option("failOnDataloss", "False")
)
kafka_read_df = kafka_batch_reader.load().cache()

In [53]:
kafka_read_df.createOrReplaceTempView("temp")

In [23]:
spark.sql("""SELECT count(distinct value)
            FROM temp
            LIMIT 10""").toPandas()

Unnamed: 0,count(DISTINCT value)
0,5000


In [54]:
df = (kafka_read_df
      .selectExpr('CAST(value AS STRING)', 'offset'))

In [4]:
# Outer event: a user id plus the visits payload, which is read as a plain
# string here and parsed separately with `visit_schema`.
event_schema = (
    StructType()
    .add('uid', StringType(), True)
    .add('visits', StringType(), True)
)

# The visits payload: an array of (url, timestamp) records.
visit_schema = ArrayType(
    StructType()
    .add('url', StringType(), True)
    .add('timestamp', LongType(), True)
)

In [55]:
# Parse the message body into an event struct, then parse the nested
# `visits` string into an array of visit structs.
parsed_events = df.select(from_json("value", event_schema).alias("event"))
clean_df = parsed_events.select(
    "event.uid",
    from_json(col("event.visits"), visit_schema).alias("visits"),
)

In [56]:
clean_df = clean_df.select("uid", explode(clean_df.visits))

In [57]:
clean_df = clean_df.selectExpr("uid", "parse_url(col.url, 'HOST') as host")
# clean_df = clean_df.filter(clean_df.host.isNotNull())

In [58]:
# Strip only a leading "www." label from the host. The pattern is anchored and
# the dot is escaped: an unanchored 'www.' matches "www" followed by ANY
# character anywhere in the host (e.g. "awwwards.com" would become "ards.com").
clean_df = clean_df\
    .withColumn('domain', regexp_replace('host', r'^www\.', ''))\
    .select('uid', 'domain')

In [59]:
clean_df = clean_df.groupBy("uid").agg(collect_set('domain').alias('domain'))

In [60]:
clean_df_vector = hasher_freq.transform(clean_df)
clean_df_norm = normalizer.transform(clean_df_vector)

In [61]:
clean_df_norm_1 = lr_gendr_model.transform(clean_df_norm)
clean_df_norm_2 = rf_age_model.transform(clean_df_norm)

In [62]:
clean_df_norm_1.select("uid", "prediction").createOrReplaceTempView("gender")
clean_df_norm_2.select("uid", "prediction").createOrReplaceTempView("age")

In [63]:
out_df = spark.sql("""SELECT g.uid
                , CAST(CAST(g.prediction AS INT) AS String) AS gender
                , CAST(CAST(a.prediction AS INT) AS String) AS age
            FROM gender g
            JOIN age a
                ON g.uid = a.uid""")

In [444]:
# out_df = spark.sql("""SELECT g.uid
#                 , 'F' AS gender
#                 , '25-34' AS age
#             FROM gender g
#             JOIN age a
#                 ON g.uid = a.uid""")

In [64]:
# Translate the stringified class indices back into human-readable labels.
gender_labels = {'0': 'F', '1': 'M'}
age_labels = {'1': '18-24', '3': '25-34', '5': '35-44', '7': '45-54', '9': '>=55'}
out_df = (
    out_df
    .replace(gender_labels, subset="gender")
    .replace(age_labels, subset="age")
)

In [65]:
pDF = out_df.select(lit("").alias('key'), to_json(struct(*out_df.columns)).alias('value'))

In [66]:
# Batch-write the key/value pairs to the output Kafka topic.
kafka_payload = pDF.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
kafka_writer = (
    kafka_payload.write
    .format("kafka")
    .option("kafka.bootstrap.servers", "spark-master-1.newprolab.com:6667")
    .option("topic", "artem.kropis")
)
kafka_writer.save()

In [320]:
out_df.show(5)

+--------------------+---+---+
|                 uid| pg| pa|
+--------------------+---+---+
|0108d217-e476-493...|  0|  3|
|0192cc54-559c-4c8...|  0|  3|
|019acd5e-be9a-4cd...|  1|  3|
|02e7f830-da57-4d5...|  1|  3|
|1d160259-73d8-451...|  1|  3|
+--------------------+---+---+
only showing top 5 rows



In [325]:
out_df_test.show(5)

+--------------------+-----+-----+
|                 uid|gendr|  age|
+--------------------+-----+-----+
|0108d217-e476-493...|    F|25-34|
|0192cc54-559c-4c8...|    F|25-34|
|019acd5e-be9a-4cd...|    M|25-34|
|02e7f830-da57-4d5...|    M|25-34|
|1d160259-73d8-451...|    M|25-34|
+--------------------+-----+-----+
only showing top 5 rows



In [306]:
clean_df_norm_1.select("uid", "prediction").show(2, False, True)

-RECORD 0------------------------------------------
 uid        | 0108d217-e476-493d-8c81-a9744f12451a 
 prediction | 0.0                                  
-RECORD 1------------------------------------------
 uid        | 0192cc54-559c-4c8e-89b4-5f4bf31e4245 
 prediction | 0.0                                  
only showing top 2 rows



In [297]:
clean_df_norm_1 = lr_gendr_model.transform(clean_df_norm)
clean_df_norm_2 = rf_age_model.transform(clean_df_norm)

In [305]:
clean_df_norm_2.select("uid", "prediction").show(2, False, True)

-RECORD 0------------------------------------------
 uid        | 0108d217-e476-493d-8c81-a9744f12451a 
 prediction | 3.0                                  
-RECORD 1------------------------------------------
 uid        | 0192cc54-559c-4c8e-89b4-5f4bf31e4245 
 prediction | 3.0                                  
only showing top 2 rows



In [282]:
clean_df.createOrReplaceTempView('test')

In [285]:
spark.sql(f"""SELECT *
            FROM test
            limit 10""").toPandas()

Unnamed: 0,uid,visits
0,bd7a30e1-a25d-4cbf-a03f-61748cbe540e,"[(http://www.interfax.ru/business/414668, 1419..."
1,bd7a6f52-45db-49bf-90f2-a3b07a9b7bcd,"[(https://www.packagetrackr.com/track/ups, 141..."
2,bd7a7fd9-ab06-42f5-bf0f-1cbb0463004c,[(http://www.mk.ru/incident/2015/02/27/latviya...
3,bd7c5d7a-0def-41d1-895f-fdb96c56c2d4,"[(http://www.24open.ru/user/elena8020445/, 142..."
4,bd7e54a2-0215-45cb-a869-9efebf250e38,[(http://www.dns-shop.ru/catalog/i172806/tverd...
5,bd7e9797-4cdb-46e1-a540-f3ea010605ad,[(http://news.meta.ua/cluster:41878362-v-sevas...
6,bd7e9ec7-fb67-45eb-8ad3-209d01d15ae6,"[(http://dynamobryansk.forum24.ru/, 1427212135..."
7,bd8056df-cc25-4b63-bc12-a46f888baa49,[(http://www.2mm.ru/mzdorovie/634/gerpes-vooru...
8,bd818690-73d2-445d-be5d-5c8f748dbb19,[(http://www.lacywear.ru/goods/category/15?utm...
9,bd81e006-f059-4cdd-b716-3467c78d1312,"[(http://nn.domru.ru/, 1426869172000)]"


In [283]:
predictions_df = spark.sql(f"""SELECT *
            FROM test""")

In [74]:
pDF = predictions_df.select(lit("").alias('key'), to_json(struct(*predictions_df.columns)).alias('value'))

In [21]:
pDF.show(5)

+---+--------------------+
|key|               value|
+---+--------------------+
|   |{"uid":"bd7a30e1-...|
|   |{"uid":"bd7a6f52-...|
|   |{"uid":"bd7a7fd9-...|
|   |{"uid":"bd7c5d7a-...|
|   |{"uid":"bd7e54a2-...|
+---+--------------------+
only showing top 5 rows



In [36]:
pDF.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)").show(5)

+---+--------------------+
|key|               value|
+---+--------------------+
|   |{"uid":"bd7a30e1-...|
|   |{"uid":"bd7a6f52-...|
|   |{"uid":"bd7a7fd9-...|
|   |{"uid":"bd7c5d7a-...|
|   |{"uid":"bd7e54a2-...|
+---+--------------------+
only showing top 5 rows



In [75]:
(
    pDF.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
        .write
        .format("kafka")
#         .outputMode("append")
#         .option("checkpointLoacation", "chekpoints/kam_checkpoints_lab04")
#         .option("kafka.batch.size", 5000)
        .option("kafka.bootstrap.servers", "spark-master-1.newprolab.com:6667")
        .option("topic", "artem.kropis")
        .save()
)

In [65]:
predictions_df.show(5)

+--------------------+------+-----+
|                 uid|gender|  age|
+--------------------+------+-----+
|bd7a30e1-a25d-4cb...|     F|25-34|
|bd7a6f52-45db-49b...|     F|25-34|
|bd7a7fd9-ab06-42f...|     F|25-34|
|bd7c5d7a-0def-41d...|     F|25-34|
|bd7e54a2-0215-45c...|     F|25-34|
+--------------------+------+-----+
only showing top 5 rows



In [429]:
kafka_read_df_test = (
        spark.read
        .format('kafka')
        .option("kafka.bootstrap.servers", 'spark-master-1.newprolab.com:6667')
        .option("subscribe", "artem.kropis")
#         .option("startingOffsets", "latest")
        .option("failOnDataloss", "False")
        .load()
        .cache()
)

In [427]:
kafka_read_df_test.count()

94880

In [430]:
kafka_read_df_test.count()

99880

In [88]:
predictions_df.limit(5).show()

+--------------------+------+-----+
|                 uid|gender|  age|
+--------------------+------+-----+
|bd7a30e1-a25d-4cb...|     M|18-24|
|bd7a6f52-45db-49b...|     M|18-24|
|bd7a7fd9-ab06-42f...|     M|18-24|
|bd7c5d7a-0def-41d...|     M|18-24|
|bd7e54a2-0215-45c...|     M|18-24|
+--------------------+------+-----+



# Обучение

In [38]:
# Raw training file: tab-separated with a header row; every column is read as a string.
schema = StructType([
    StructField("gender", StringType(), True),
    StructField("age", StringType(), True),
    StructField("uid", StringType(), True),
    StructField("user_json", StringType(), True),
])

csv_reader = (
    spark.read
    .format("csv")
    .options(header=True, sep="\t")
    .schema(schema)
)
df_train = csv_reader.load("/labs/slaba04/gender_age_dataset.txt")

# Expose the raw frame to Spark SQL under the name `train`.
df_train.createOrReplaceTempView("train")

In [39]:
df_train = spark.sql("""SELECT gender, age, uid, user_json
          FROM train
          where gender != '-'""")

In [40]:
# Two-step JSON parse: first the event envelope, then its `visits` string.
label_columns = ["gender", "age", "uid"]
df_train = df_train.select(
    *label_columns,
    from_json("user_json", event_schema).alias("event")
)
df_train = df_train.select(
    *label_columns,
    from_json(col("event.visits"), visit_schema).alias("visits")
)

In [41]:
df_train = df_train.select("gender", "age", "uid", explode(df_train.visits))

In [42]:
df_train = df_train.selectExpr("gender", "age", "uid", "parse_url(col.url, 'HOST') as host")
df_train = df_train.filter(df_train.host.isNotNull())

In [43]:
#df_train.groupby('host').count().sort(col('count').desc()).show(10)
#df_train.replace('www.', '', 'host').select('host').distinct().count()

# Strip only a leading "www." label from the host. The pattern is anchored and
# the dot is escaped: an unanchored 'www.' matches "www" followed by ANY
# character anywhere in the host (e.g. "awwwards.com" would become "ards.com").
df_train = df_train\
    .withColumn('domain', regexp_replace('host', r'^www\.', ''))\
    .select('gender', 'age', 'uid', 'domain')
# df_train.filter(df_train.host.contains('www.')).withColumn('domain', regexp_replace('host', 'www.', '')).select('domain').distinct().count()
#df_train.filter(df_train.host.isNull()).show(10)

In [44]:
df_train.createOrReplaceTempView("train")

In [574]:
df_train

DataFrame[gender: string, age: string, uid: string, domain: string]

In [45]:
spark.sql("""SELECT DISTINCT domain 
        FROM train
        GROUP BY domain
        HAVING count(1) <= 2""").createOrReplaceTempView("train_not")

In [13]:
# Join the training rows against the `train_not` view and cache the result.
# NOTE(review): `/*broadcast(tn)*/` is an ordinary SQL comment, not an optimizer
# hint (Spark SQL hints use the `/*+ BROADCAST(tn) */` form), so no broadcast
# join is being requested here.
# NOTE(review): `train_not` is built from domains with count(1) <= 2, and this
# is an inner join, so only rows for those rare domains are kept. Confirm that
# excluding them (e.g. with a LEFT ANTI JOIN) was not the actual intent.
df_train = spark.sql("""SELECT /*broadcast(tn)*/
            t.* 
        FROM train t
        JOIN train_not tn
            on t.domain = tn.domain
        """).cache()

In [46]:
df_train.count()

5184901

In [47]:
# One row per (gender, age, uid) with the set of distinct domains visited.
df_train = (
    df_train
    .groupBy("gender", "age", "uid")
    .agg(collect_set('domain').alias('domain'))
)

# Hash the domain set into a 1000-feature term-frequency vector.
hasher_freq = HashingTF(
    inputCol="domain",
    outputCol="domain_vector",
    numFeatures=1000,
    binary=False,
)
df_train_vector = hasher_freq.transform(df_train)

# Normalize each feature vector (Normalizer defaults are used).
normalizer = Normalizer(inputCol='domain_vector', outputCol="domain_norm")
df_train_norm = normalizer.transform(df_train_vector)

In [48]:
# Encode the categorical labels as numeric strings; they are cast to numbers later.
gender_codes = {'F': '0', 'M': '1'}
age_codes = {'18-24': '1', '25-34': '3', '35-44': '5', '45-54': '7', '>=55': '9'}
df_train_norm = (
    df_train_norm
    .replace(gender_codes, subset="gender")
    .replace(age_codes, subset="age")
)

In [49]:
df_train_norm.createOrReplaceTempView("train_norm")

In [50]:
df_train_norm = spark.sql("""SELECT CAST(gender AS FLOAT) as gender,  CAST(age AS INT) age, domain_norm
            FROM train_norm""")

In [51]:
lr_gendr = LogisticRegression(featuresCol='domain_norm', labelCol="gender", maxIter=29, regParam=0.020436539365475917)
lr_gendr_model = lr_gendr.fit(df_train_norm)


In [52]:
rf_age = RandomForestClassifier(labelCol="age", featuresCol="domain_norm")
rf_age_model = rf_age.fit(df_train_norm)

In [251]:
# Logistic regression on the age label. `RandomForest` is not an imported name
# and `maxIter` is a LogisticRegression parameter; the traceback recorded for
# this cell shows LogisticRegression.train was the estimator that actually ran.
lr_age = LogisticRegression(featuresCol='domain_norm', labelCol="age", maxIter=15)
lr_age_model = lr_age.fit(df_train_norm)

Py4JJavaError: An error occurred while calling o3368.fit.
: java.lang.UnsupportedOperationException: empty.min
	at scala.collection.TraversableOnce$class.min(TraversableOnce.scala:222)
	at scala.collection.mutable.ArrayOps$ofDouble.min(ArrayOps.scala:270)
	at org.apache.spark.ml.classification.LogisticRegression$$anonfun$train$1.apply(LogisticRegression.scala:523)
	at org.apache.spark.ml.classification.LogisticRegression$$anonfun$train$1.apply(LogisticRegression.scala:494)
	at org.apache.spark.ml.util.Instrumentation$$anonfun$11.apply(Instrumentation.scala:185)
	at scala.util.Try$.apply(Try.scala:192)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:185)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:494)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:489)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:279)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)


In [40]:
!hadoop fs -ls /labs/slaba04

Found 1 items
-rw-r--r--   3 hdfs hdfs  655090069 2022-01-06 18:46 /labs/slaba04/gender_age_dataset.txt


In [None]:
s

In [83]:
def create_console_sink_with_checkpoint(chk_name, df):
    """Return an unstarted console writer with a per-name checkpoint directory.

    Args:
        chk_name: Name of the checkpoint sub-directory under ``/tmp/chk_gr``.
            Passing ``"input_artem.kropis"`` reproduces the path that used to
            be hard-coded here.
        df: Streaming DataFrame to write.

    Returns:
        The configured writer (console format, 10-second trigger,
        ``truncate=false``, ``numRows=20``); call ``.start()`` to run it.
    """
    # The checkpoint path is derived from chk_name so that different queries
    # do not share (and corrupt) one checkpoint directory.
    checkpoint_dir = "/tmp/chk_gr/{name}".format(name=chk_name)
    return df \
        .writeStream \
        .format("console") \
        .trigger(processingTime="10 seconds") \
        .option("checkpointLocation", checkpoint_dir) \
        .option("truncate", "false") \
        .option("numRows", "20")

In [10]:
SparkSession.builder.getOrCreate().streams.active[1].lastProgress["sources"]

IndexError: list index out of range

In [37]:
def kill_all():
    """Stop every active streaming query of the current session, then print 'Ok'."""
    active_queries = SparkSession.builder.getOrCreate().streams.active
    for query in active_queries:
        query.stop()
    print('Ok')

In [None]:
kill_all()

In [8]:
kafka_sdf.show()

# df.groupBy(col("topic"), col("partition")).count().show()

AnalysisException: 'Queries with streaming sources must be executed with writeStream.start();;\nkafka'

In [6]:
read_kafka_params

{'kafka.bootstrap.servers': 'spark-master-4.newprolab.com:6667',
 'subscribe': 'input_artem.kropis',
 'startingOffsets': 'latest'}

In [None]:
write_kafka_params = {
   "kafka.bootstrap.servers": 'spark-master-1.newprolab.com:6667',
   "topic": "ivan.ivanov"
}
batch_df.writeStream.format("kafka").options(**write_kafka_params)\
    .option("checkpointLocation", "streaming/chk/chk_kafka")\
    .option("failOnDataLoss", 'False')\
    .outputMode("append").start()

In [3]:
spark.range(10).isStreaming

False

In [4]:
sdf = spark.readStream.format("rate").load()
sdf

DataFrame[timestamp: timestamp, value: bigint]

In [5]:
sdf.isStreaming

True

У `sdf`, как и у любого DF, есть схема и план выполнения:

In [6]:
sdf.printSchema()
sdf.explain(True)

root
 |-- timestamp: timestamp (nullable = true)
 |-- value: long (nullable = true)

== Parsed Logical Plan ==
StreamingRelationV2 org.apache.spark.sql.execution.streaming.sources.RateStreamProvider@39325d82, rate, [timestamp#2, value#3L]

== Analyzed Logical Plan ==
timestamp: timestamp, value: bigint
StreamingRelationV2 org.apache.spark.sql.execution.streaming.sources.RateStreamProvider@39325d82, rate, [timestamp#2, value#3L]

== Optimized Logical Plan ==
org.apache.spark.sql.AnalysisException: Queries with streaming sources must be executed with writeStream.start();;
rate
== Physical Plan ==
org.apache.spark.sql.AnalysisException: Queries with streaming sources must be executed with writeStream.start();;
rate


В отличие от обычных DF, для `sdf` недоступны такие методы, как `show`, `collect`, `take`. Поэтому, чтобы посмотреть его содержимое, мы должны использовать `console` синк и создать `StreamingQuery`. Процессинг начинается только после вызова метода `start`. `trigger` позволяет настроить, как часто стрим будет читать новые данные и обрабатывать их.

In [7]:
def create_console_sink(df):
    """Build (but do not start) a console writer for the streaming frame ``df``.

    Uses the ``console`` format, a 5-second processing-time trigger and the
    options ``truncate=false`` / ``numRows=20``.
    """
    console_options = [("truncate", "false"), ("numRows", "20")]
    writer = df.writeStream.format("console").trigger(processingTime="5 seconds")
    for name, value in console_options:
        writer = writer.option(name, value)
    return writer

In [8]:
sink = create_console_sink(sdf)
sink

<pyspark.sql.streaming.DataStreamWriter at 0x7fd1949d85f8>

In [9]:
sq = sink.start() # StreamingQuery
sq.isActive

True

Чтобы остановить стрим, можно вызвать метод `stop` у объекта `StreamingQuery` (`sq`), либо получить список всех активных стримов и остановить их:

In [10]:
sq.stop()

In [11]:
def kill_all():
    """Stop every active streaming query and print which source each one read.

    ``lastProgress`` is ``None`` until a query has reported its first progress
    update. In that case (or when no source is listed) the query is still
    stopped and a placeholder description is printed, instead of raising
    before ``stop()`` and leaving the remaining streams running.
    """
    for query in SparkSession.builder.getOrCreate().streams.active:
        progress = query.lastProgress
        sources = progress.get("sources") if progress else None
        desc = sources[0]["description"] if sources else "<no progress reported>"
        query.stop()
        print("Stopped {s}".format(s=desc))

In [12]:
kill_all()

Создадим стрим, выполняющий запись в `parquet` файл:

In [13]:
def create_parquet_sink(df, file_name):
    """Return an unstarted parquet writer for a streaming DataFrame.

    The frame is repartitioned to a single partition, written to ``file_name``
    and checkpointed under ``/tmp/chk_sg/<file_name>``; the trigger fires every
    10 seconds. Call ``.start()`` on the result to launch the query.
    """
    writer = df.repartition(1).writeStream.format("parquet")
    output_path = "{f}".format(f=file_name)
    checkpoint_path = "/tmp/chk_sg/{f}".format(f=file_name)
    writer = writer.option("path", output_path)
    writer = writer.option("checkpointLocation", checkpoint_path)
    return writer.trigger(processingTime="10 seconds")

In [14]:
sink = create_parquet_sink(sdf, "ss_01.parquet")
sink

<pyspark.sql.streaming.DataStreamWriter at 0x7fd1949e8438>

In [15]:
sq = sink.start()
sq

<pyspark.sql.streaming.StreamingQuery at 0x7fd1949e8b38>

In [16]:
sq.isActive

True

In [17]:
kill_all()

Stopped RateStreamV2[rowsPerSecond=1, rampUpTimeSeconds=0, numPartitions=default


Убедимся, что стрим пишется в файл:

In [18]:
!hadoop fs -ls ss_01.parquet

Found 7 items
drwxr-xr-x   - sergey.grishaev sergey.grishaev          0 2022-07-21 19:36 ss_01.parquet/_spark_metadata
-rw-r--r--   3 sergey.grishaev sergey.grishaev        722 2022-07-21 19:36 ss_01.parquet/part-00000-36769261-e319-481f-849f-5cc45056ac44-c000.snappy.parquet
-rw-r--r--   3 sergey.grishaev sergey.grishaev     133022 2022-07-21 17:47 ss_01.parquet/part-00000-58add991-8da6-40fb-9d36-76eb30639b79-c000.snappy.parquet
-rw-r--r--   3 sergey.grishaev sergey.grishaev        392 2022-07-21 14:47 ss_01.parquet/part-00000-84997645-f350-473d-b65e-f38caeda4fff-c000.snappy.parquet
-rw-r--r--   3 sergey.grishaev sergey.grishaev      32750 2022-07-21 19:36 ss_01.parquet/part-00000-8acdf9ff-a5ee-4e3d-a352-0da4df1e5e20-c000.snappy.parquet
-rw-r--r--   3 sergey.grishaev sergey.grishaev      38485 2022-07-21 18:39 ss_01.parquet/part-00000-ac360620-b92a-4d22-b40d-fd266bf3db07-c000.snappy.parquet
-rw-r--r--   3 sergey.grishaev sergey.grishaev       9224 2022-07-21 18:52 ss_01.parquet/

Прочитаем файл с помощью Spark:

In [19]:
from pyspark.sql.functions import max, col

rates = spark.read.parquet("ss_01.parquet")
print(rates.count())
rates.printSchema()
rates.show(5, False)
rates.select(max(col("timestamp"))).show(10, False)

17353
root
 |-- timestamp: timestamp (nullable = true)
 |-- value: long (nullable = true)

+-----------------------+-----+
|timestamp              |value|
+-----------------------+-----+
|2022-07-21 14:47:26.098|0    |
|2022-07-21 14:47:28.098|2    |
|2022-07-21 14:47:30.098|4    |
|2022-07-21 14:47:32.098|6    |
|2022-07-21 14:47:34.098|8    |
+-----------------------+-----+
only showing top 5 rows

+-----------------------+
|max(timestamp)         |
+-----------------------+
|2022-07-21 19:36:38.098|
+-----------------------+



Параллельно внутри одного Spark приложения может работать несколько стримов:

In [22]:
# Two independent console writers over the same streaming source.
console_sink = create_console_sink(sdf)
console_sink_1 = create_console_sink(sdf)

# Start each writer exactly once, so that the second query comes from
# `console_sink_1` rather than from a second start of `console_sink`.
console_sq = console_sink.start()
console_sq_1 = console_sink_1.start()

In [23]:
kill_all()

Stopped RateStreamV2[rowsPerSecond=1, rampUpTimeSeconds=0, numPartitions=default
Stopped RateStreamV2[rowsPerSecond=1, rampUpTimeSeconds=0, numPartitions=default


Добавим к нашему стриму колонку со случайным `ident` аэропорта из датасета [Airport Codes](https://datahub.io/core/airport-codes)  

In [24]:
csv_options = {"header": "true", "inferSchema": "true"}
airports = spark.read.options(**csv_options).csv("airport-codes_csv.csv")
airports.printSchema()
airports.show(1, 200, True)

root
 |-- ident: string (nullable = true)
 |-- type: string (nullable = true)
 |-- name: string (nullable = true)
 |-- elevation_ft: integer (nullable = true)
 |-- continent: string (nullable = true)
 |-- iso_country: string (nullable = true)
 |-- iso_region: string (nullable = true)
 |-- municipality: string (nullable = true)
 |-- gps_code: string (nullable = true)
 |-- iata_code: string (nullable = true)
 |-- local_code: string (nullable = true)
 |-- coordinates: string (nullable = true)

-RECORD 0------------------------------------------
 ident        | 00A                                
 type         | heliport                           
 name         | Total Rf Heliport                  
 elevation_ft | 11                                 
 continent    | NA                                 
 iso_country  | US                                 
 iso_region   | US-PA                              
 municipality | Bensalem                           
 gps_code     | 00A                 

In [25]:
from pyspark.sql.functions import col
idents_rows = airports.select(col("ident")).limit(200).distinct().collect()
idents = [row["ident"] for row in idents_rows]
idents

['00A',
 '00AA',
 '00AK',
 '00AL',
 '00AR',
 '00AS',
 '00AZ',
 '00CA',
 '00CL',
 '00CN',
 '00CO',
 '00FA',
 '00FD',
 '00FL',
 '00GA',
 '00GE',
 '00HI',
 '00ID',
 '00IG',
 '00II',
 '00IL',
 '00IN',
 '00IS',
 '00KS',
 '00KY',
 '00LA',
 '00LL',
 '00LS',
 '00MD',
 '00MI',
 '00MN',
 '00MO',
 '00MT',
 '00N',
 '00NC',
 '00NJ',
 '00NK',
 '00NY',
 '00OH',
 '00OI',
 '00OK',
 '00OR',
 '00PA',
 '00PN',
 '00PS',
 '00S',
 '00SC',
 '00SD',
 '00TA',
 '00TE',
 '00TN',
 '00TS',
 '00TX',
 '00UT',
 '00VA',
 '00VI',
 '00W',
 '00WA',
 '00WI',
 '00WN',
 '00WV',
 '00WY',
 '00XS',
 '01A',
 '01AK',
 '01AL',
 '01AR',
 '01AZ',
 '01C',
 '01CA',
 '01CL',
 '01CN',
 '01CO',
 '01CT',
 '01FA',
 '01FD',
 '01FL',
 '01GA',
 '01GE',
 '01IA',
 '01ID',
 '01II',
 '01IL',
 '01IN',
 '01IS',
 '01J',
 '01K',
 '01KS',
 '01KY',
 '01LA',
 '01LL',
 '01LS',
 '01MA',
 '01MD',
 '01ME',
 '01MI',
 '01MN',
 '01MO',
 '01MT',
 '01NC',
 '01NE',
 '01NH',
 '01NJ',
 '01NM',
 '01NV',
 '01NY',
 '01OI',
 '01OK',
 '01OR',
 '01PA',
 '01PN',
 '01PS',


In [26]:
from pyspark.sql.functions import shuffle, array, lit
idents_cols = [lit(i) for i in idents]
idents_array = array(*idents_cols)
shuffled = shuffle(idents_array)
rand_ident = shuffled[0]

ident_sdf = sdf.withColumn("ident", rand_ident)
ident_sdf

DataFrame[timestamp: timestamp, value: bigint, ident: string]

In [27]:
ident_pq_sink = create_parquet_sink(ident_sdf, "s2.parquet")
ident_pq_sq = ident_pq_sink.start()

In [28]:
!rm -rf /tmp/chk_sg/*

ls: cannot access '/tmp/chk_sg/*': No such file or directory


Проверим, что данные записываются в `parquet`

In [29]:
ident_pq = spark.read.parquet("s2.parquet")
print(ident_pq.count())
ident_pq.printSchema()
ident_pq.show(5, False)

7282
root
 |-- timestamp: timestamp (nullable = true)
 |-- value: long (nullable = true)
 |-- ident: string (nullable = true)

+-----------------------+-----+-----+
|timestamp              |value|ident|
+-----------------------+-----+-----+
|2022-07-21 18:52:22.571|3525 |01IA |
|2022-07-21 18:52:24.571|3527 |02OK |
|2022-07-21 18:52:26.571|3529 |01MT |
|2022-07-21 18:52:28.571|3531 |03AZ |
|2022-07-21 18:52:30.571|3533 |02CT |
+-----------------------+-----+-----+
only showing top 5 rows



Временно остановим стрим, он понадобится нам для следующих экспериментов

In [30]:
kill_all()

Stopped RateStreamV2[rowsPerSecond=1, rampUpTimeSeconds=0, numPartitions=default


### Выводы:
- `rate` - самый простой способ создать стрим для тестирования приложений
- стрим начинает работу после вызова метода `start` и не блокирует основной поток программы
- в одном Spark приложении может работать несколько стримов одновременно

## File Streaming
Spark позволяет запустить стрим, который будет "слушать" директорию и читать из нее новые файлы. При этом за раз будет прочитано количество файлов, установленное в параметре `maxFilesPerTrigger` [ссылка](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#input-sources). В этом кроется одна из основных проблем данного источника: стрим, сконфигурированный под чтение небольших файлов, может "упасть", если в директорию начнут попадать файлы большого объема. Создадим стрим из директории `s2.parquet`:

In [31]:
sdf_from_parquet = spark \
                    .readStream \
                    .format("parquet") \
                    .option("maxFilesPerTrigger", "1") \
                    .option("path", "s2.parquet") \
                    .load()

sdf_from_parquet.printSchema()

IllegalArgumentException: "Schema must be specified when creating a streaming source DataFrame. If some files already exist in the directory, then depending on the file format you may be able to create a static DataFrame on that directory with 'spark.read.load(directory)' and infer schema from it."

Поскольку в директорию могут попасть любые данные, а df должен иметь фиксированную схему, то Spark не позволяет нам создавать SDF на основе файлов без указания схемы.

In [32]:
from pyspark.sql.functions import lower

# Streaming read of the parquet directory, one file per micro-batch.
# The schema is taken from the static DataFrame `ident_pq`, which must
# already exist when this cell runs.
sdf_from_parquet = (
    spark.readStream
    .format("parquet")
    .schema(ident_pq.schema)
    .option("maxFilesPerTrigger", "1")
    .option("path", "s2.parquet")
    .load()
    # Normalise the identifier column to lower case.
    .withColumn("ident", lower(col("ident")))
)

sdf_from_parquet.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- value: long (nullable = true)
 |-- ident: string (nullable = true)



In [33]:
# Attach the console sink (helper defined earlier in the notebook) to the file stream.
console_sink = create_console_sink(sdf_from_parquet)
# start() launches the query and returns a StreamingQuery handle without blocking the notebook.
console_sink.start()

<pyspark.sql.streaming.StreamingQuery at 0x7fd1a806ba20>

In [34]:
# Stop the file stream started above (the recorded run reports "Stopped FileStreamSource[...]").
kill_all()

Stopped FileStreamSource[hdfs://spark-master-1.newprolab.com:8020/user/sergey.grishaev/s2.parquet]


File source позволяет работать со всеми типами файлов, с которыми умеет работать Spark: `parquet`, `orc`, `csv`, `json`, `text`.

### Выводы:
- Spark позволяет создавать SDF на базе всех поддерживаемых типов файлов
- При создании SDF вы должны указать схему данных
- File streaming имеет несколько серьезных недостатков:
  + Входной поток можно ограничить только максимальным количеством файлов, попадающих в батч
  + Если стрим упадет посередине файла, то при перезапуске эти данные будут обработаны еще раз

<img align="right" width="100" height="100" src="https://upload.wikimedia.org/wikipedia/commons/thumb/0/05/Apache_kafka.svg/1200px-Apache_kafka.svg.png">

## Kafka streaming

https://kafka.apache.org

**Apache Kafka** - самая распространенная в мире система, на основе которой строятся приложения для поточной обработки данных. Она имеет несколько преимуществ:
- высокая пропускная способность
- высокая доступность за счет распределенной архитектуры и репликации
- у каждого сообщения есть свой номер, который называется offset, что позволяет гранулярно сохранять состояние стрима

### Архитектура системы

#### Topic
Топик - это аналог таблицы в Kafka. Мы пишем данные в топик и читаем данные из топика. Топик, как правило, распределен по нескольким узлам кластера для обеспечения высокой доступности и скорости работы с данными

<img align="center" width="500" height="500" src="https://kafka.apache.org/25/images/log_anatomy.png">

#### Partition
Партиции - это блоки, из которых состоят топики. Партиция представляет собой неделимый блок, который хранится на одном из узлов. Топик может иметь произвольное количество партиций. Чем больше партиций - тем выше параллелизм при чтении и записи, однако слишком большое число партиций в топике может привести к замедлению работы всей системы.

#### Replica
Каждая партиция имеет (может иметь) несколько реплик. Внешние приложения всегда работают (читают и пишут) с основной репликой. Остальные реплики являются дочерними и не используются во внешнем IO. Если узел, на котором расположена основная реплика, падает, то одна из дочерних реплик становится основной и работа с данными продолжается

#### Message
Сообщения - это данные, которые мы пишем в Kafka и читаем из нее. Они представлены кортежем (Key, Value), при этом ключ используется не всегда и может иметь значение `null`. Сериализация и десериализация данных всегда происходят на уровне клиентов Kafka. Сама Kafka ничего о типах данных не знает и хранит ключи и значения в виде массивов байт

#### Offset
Оффсет - это порядковый номер сообщения в партиции. Когда мы пишем сообщение (сообщение всегда пишется в одну из партиций топика), Kafka помещает его в партицию с номером `n+1`, где `n` - номер последнего сообщения в этой партиции

<img align="center" width="400" height="400" src="https://kafka.apache.org/25/images/log_consumer.png">

#### Producer
Producer - это приложение, которое пишет в топик. Producer'ов может быть много. Параллельная запись достигается за счет того, что каждое новое сообщение попадает в случайную партицию топика (если не указан `key`)

#### Consumer
Consumer - это приложение, читающее данные из топика. Consumer'ов может быть много, в этом случае они называются `consumer group`. Параллельное чтение достигается за счет распределения партиций топика между consumer'ами в рамках одной группы. Каждый consumer читает данные из "своих" партиций и ничего про другие не знает. Если consumer падает, то "его" партиции переходят другим consumer'ам.

#### Commit
Коммитом в Kafka называют сохранение информации о факте обработки сообщения с определенным оффсетом. Поскольку оффсеты для каждой партиции топика свои, то и информация о последнем обработанном оффсете хранится по каждой партиции отдельно. Обычные приложения пишут коммиты в специальный топик Kafka, который имеет название `__consumer_offsets`. Spark хранит обработанные оффсеты по каждому батчу в ФС (например, в HDFS).

#### Retention
Поскольку кластер Kafka не может хранить данные вечно, то в ее конфигурации задаются пороговые значения по **объему** и **времени хранения** для каждого топика, при превышении которых данные удаляются. Например, если у топика A установлен retention по времени 1 месяц, то данные будут храниться в системе не менее одного месяца (и затем будут удалены одной из внутренних подсистем)

### Spark connector
https://mvnrepository.com/artifact/org.apache.spark/spark-sql-kafka-0-10  
https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html  

### Запуск Kafka в docker
```shell
docker run --rm \
   -p 2181:2181 \
   --name=test_zoo \
   -e ZOOKEEPER_CLIENT_PORT=2181 \
   confluentinc/cp-zookeeper
```

```shell
docker run --rm \
    -p 9092:9092 \
    --name=test_kafka \
    -e KAFKA_ZOOKEEPER_CONNECT=host.docker.internal:2181 \
    -e KAFKA_ADVERTISED_LISTENERS=PLAINTEXT://host.docker.internal:9092 \
    -e KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR=1 \
    confluentinc/cp-kafka
```

### Работа с Kafka с помощью Static Dataframe

Spark позволяет работать с Kafka как с обычной базой данных. Запишем данные в топик `test_topic2`. Для этого нам необходимо подготовить DF, в котором будет две колонки:
- `value: String` - данные, которые мы хотим записать
- `topic: String` - топик, куда писать каждую строку DF

In [35]:
# Static (batch) DataFrame over the same parquet directory; its schema is reused
# elsewhere in the notebook for the streaming file source and for from_json.
ident_pq = spark.read.parquet("s2.parquet")
print(ident_pq.count())
ident_pq.printSchema()
# Second argument False disables truncation of long cell values.
ident_pq.show(5, False)

7292
root
 |-- timestamp: timestamp (nullable = true)
 |-- value: long (nullable = true)
 |-- ident: string (nullable = true)

+-----------------------+-----+-----+
|timestamp              |value|ident|
+-----------------------+-----+-----+
|2022-07-21 18:52:22.571|3525 |01IA |
|2022-07-21 18:52:24.571|3527 |02OK |
|2022-07-21 18:52:26.571|3529 |01MT |
|2022-07-21 18:52:28.571|3531 |03AZ |
|2022-07-21 18:52:30.571|3533 |02CT |
+-----------------------+-----+-----+
only showing top 5 rows



In [36]:
from pyspark.sql.functions import struct, to_json

def write_kafka(topic, data, bootstrap_servers="spark-master-1.newprolab.com:6667", preview_rows=5):
    """Serialize every row of a static DataFrame to JSON and write it to Kafka.

    Each row of ``data`` becomes one Kafka message whose ``value`` is the JSON
    document built from all of the row's columns. Only the ``value`` and
    ``topic`` columns are produced, so no message key is set.

    Args:
        topic: Name of the target Kafka topic; stored in the ``topic`` column
            of the prepared DataFrame.
        data: Static DataFrame whose rows should be written.
        bootstrap_servers: Kafka ``host:port`` list passed as
            ``kafka.bootstrap.servers``. Defaults to the broker that used to
            be hard-coded here, so existing calls behave as before.
        preview_rows: Number of prepared rows to print before writing
            (default 5, as before). Pass 0 or ``None`` to skip the preview
            and the extra Spark job it triggers.
    """
    kafka_params = {"kafka.bootstrap.servers": bootstrap_servers}
    # Pack all columns into a struct and render it as a single JSON string.
    kafka_doc = to_json(struct(col("*")))
    raw = data \
        .select(kafka_doc.alias("value")) \
        .withColumn("topic", lit(topic))

    if preview_rows:
        # Untruncated preview of what is about to be written.
        raw.show(preview_rows, False)

    raw \
        .write.format("kafka") \
        .options(**kafka_params).save()

In [37]:
# Write the parquet-backed DataFrame to Kafka as JSON messages; re-running this
# cell writes the same rows to the topic again.
write_kafka("test_topic2", ident_pq)

+-------------------------------------------------------------------------+-----------+
|value                                                                    |topic      |
+-------------------------------------------------------------------------+-----------+
|{"timestamp":"2022-07-21T18:52:22.571+03:00","value":3525,"ident":"01IA"}|test_topic2|
|{"timestamp":"2022-07-21T18:52:24.571+03:00","value":3527,"ident":"02OK"}|test_topic2|
|{"timestamp":"2022-07-21T18:52:26.571+03:00","value":3529,"ident":"01MT"}|test_topic2|
|{"timestamp":"2022-07-21T18:52:28.571+03:00","value":3531,"ident":"03AZ"}|test_topic2|
|{"timestamp":"2022-07-21T18:52:30.571+03:00","value":3533,"ident":"02CT"}|test_topic2|
+-------------------------------------------------------------------------+-----------+
only showing top 5 rows



Прочитаем данные из Kafka:

In [38]:
# Batch (static) read from Kafka, subscribing to three topics at once.
# No startingOffsets/endingOffsets are given, so the whole content of the
# subscribed topics is read.
kafka_params = {
    "kafka.bootstrap.servers": "spark-master-1.newprolab.com:6667",
    "subscribe": "test_topic0, test_topic1, test_topic2"
}

df = spark.read.format("kafka").options(**kafka_params).load()

df.printSchema()
df.show()

# Number of messages per topic and partition.
df.groupBy(col("topic"), col("partition")).count().show()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)

+----+--------------------+-----------+---------+------+--------------------+-------------+
| key|               value|      topic|partition|offset|           timestamp|timestampType|
+----+--------------------+-----------+---------+------+--------------------+-------------+
|null|[7B 22 74 69 6D 6...|test_topic2|        0|    43|2022-07-21 18:40:...|            0|
|null|[7B 22 74 69 6D 6...|test_topic2|        0|    44|2022-07-21 18:40:...|            0|
|null|[7B 22 74 69 6D 6...|test_topic2|        0|    45|2022-07-21 18:40:...|            0|
|null|[7B 22 74 69 6D 6...|test_topic2|        0|    46|2022-07-21 18:40:...|            0|
|null|[7B 22 74 69 6D 6...|test_topic2|        0|    47|2022-07-21 18:40:

Чтение из Kafka имеет несколько особенностей:
- по умолчанию читается все содержимое топика. Поскольку обычно в нем много данных, эта операция может создать большую нагрузку на кластер Kafka и Spark приложение
- колонки `value` и `key` имеют тип `binary`, поэтому их содержимое необходимо десериализовать

In [39]:
# The Kafka `value` column is binary; casting it to string reveals the JSON text that was written.
df.select(col("value").cast("string")).show(10, False)

+------------------------------------------------------------------------+
|value                                                                   |
+------------------------------------------------------------------------+
|{"timestamp":"2022-07-21T17:54:39.571+03:00","value":62,"ident":"02KT"} |
|{"timestamp":"2022-07-21T17:55:29.571+03:00","value":112,"ident":"02XA"}|
|{"timestamp":"2022-07-21T17:54:41.571+03:00","value":64,"ident":"01MO"} |
|{"timestamp":"2022-07-21T17:55:31.571+03:00","value":114,"ident":"02MN"}|
|{"timestamp":"2022-07-21T17:54:43.571+03:00","value":66,"ident":"00WI"} |
|{"timestamp":"2022-07-21T17:55:33.571+03:00","value":116,"ident":"03AL"}|
|{"timestamp":"2022-07-21T17:54:45.571+03:00","value":68,"ident":"00CO"} |
|{"timestamp":"2022-07-21T17:55:35.571+03:00","value":118,"ident":"01PA"}|
|{"timestamp":"2022-07-21T17:54:47.571+03:00","value":70,"ident":"00OK"} |
|{"timestamp":"2022-07-21T17:55:37.571+03:00","value":120,"ident":"02MT"}|
+------------------------

In [40]:
# Three ways to pull fields out of a JSON string column, shown in this and the next cells:
#   get_json_object - a single field addressed by a JSON path (used below)
#   from_json       - parse the whole document against a schema
#   json_tuple      - several top-level fields at once
# NOTE(review): this wildcard import is what brings get_json_object and json_tuple
# into scope, but it also imports Spark functions such as sum/min/max that shadow
# the Python builtins of the same name. Prefer explicit imports once it is clear
# which names the rest of the notebook relies on.
from pyspark.sql.functions import *
df.select(get_json_object(col("value").cast("string"), "$.ident").alias("ident")).show()

+-----+
|ident|
+-----+
| 02KT|
| 02XA|
| 01MO|
| 02MN|
| 00WI|
| 03AL|
| 00CO|
| 01PA|
| 00OK|
| 02MT|
| 00IG|
| 02TA|
| 00SC|
| 00OK|
|  00A|
| 01VA|
| 00KS|
| 00OR|
| 00CN|
| 01CO|
+-----+
only showing top 20 rows



In [41]:
# json_tuple extracts several top-level JSON fields in one pass; alias() with
# two names labels the two generated columns.
df.select(
    json_tuple(col("value").cast("string"), "ident", "value").alias("ident", "value")
).show()

+-----+-----+
|ident|value|
+-----+-----+
| 02KT|   62|
| 02XA|  112|
| 01MO|   64|
| 02MN|  114|
| 00WI|   66|
| 03AL|  116|
| 00CO|   68|
| 01PA|  118|
| 00OK|   70|
| 02MT|  120|
| 00IG|   63|
| 02TA|  122|
| 00SC|   65|
| 00OK|  124|
|  00A|   67|
| 01VA|  126|
| 00KS|  128|
| 00OR|  130|
| 00CN|   69|
| 01CO|  132|
+-----+-----+
only showing top 20 rows



In [42]:
# Parse the JSON text into a struct using the schema of `ident_pq`,
# then expand the struct's fields into top-level columns.
(
    df
    .select(from_json(col("value").cast("string"), ident_pq.schema).alias("s"))
    .select(col("s.*"))
    .show()
)

+--------------------+-----+-----+
|           timestamp|value|ident|
+--------------------+-----+-----+
|2022-07-21 17:54:...|   62| 02KT|
|2022-07-21 17:55:...|  112| 02XA|
|2022-07-21 17:54:...|   64| 01MO|
|2022-07-21 17:55:...|  114| 02MN|
|2022-07-21 17:54:...|   66| 00WI|
|2022-07-21 17:55:...|  116| 03AL|
|2022-07-21 17:54:...|   68| 00CO|
|2022-07-21 17:55:...|  118| 01PA|
|2022-07-21 17:54:...|   70| 00OK|
|2022-07-21 17:55:...|  120| 02MT|
|2022-07-21 17:54:...|   63| 00IG|
|2022-07-21 17:55:...|  122| 02TA|
|2022-07-21 17:54:...|   65| 00SC|
|2022-07-21 17:55:...|  124| 00OK|
|2022-07-21 17:54:...|   67|  00A|
|2022-07-21 17:55:...|  126| 01VA|
|2022-07-21 17:55:...|  128| 00KS|
|2022-07-21 17:55:...|  130| 00OR|
|2022-07-21 17:54:...|   69| 00CN|
|2022-07-21 17:55:...|  132| 01CO|
+--------------------+-----+-----+
only showing top 20 rows



Чтобы прочитать только определенную часть топика, нам необходимо задать минимальный и максимальный оффсет для чтения с помощью параметров `startingOffsets` , `endingOffsets`. Возьмем два случайных события:

In [None]:
# df.sample(0.1).limit(2).select(col("topic"), col("partition"), col("offset")).show()

На основании этих событий подготовим параметры `startingOffsets` и `endingOffsets`

In [43]:
# Batch read limited to an explicit offset range for each topic-partition.
# startingOffsets / endingOffsets are JSON strings of the form
# {"<topic>": {"<partition>": <offset>}}.
# NOTE(review): the offsets below are hard-coded. In the recorded run the read
# failed with OffsetOutOfRangeException because offset 4 of test_topic0-0 was
# not available in Kafka. Choose offsets that currently exist in the topics
# (for example with the sampling cell above) before running this cell.
kafka_params = {
    "kafka.bootstrap.servers": "spark-master-1.newprolab.com:6667",
    "subscribe": "test_topic0,test_topic1,test_topic2",
    "startingOffsets": """ { "test_topic0": { "0": 4 },"test_topic1": { "0": 5 },"test_topic2": { "0": 6 } } """,
    "endingOffsets": """ { "test_topic0": { "0": 7 },"test_topic1": { "0": 8 },"test_topic2": { "0": 10 } }  """
}


df = spark.read.format("kafka").options(**kafka_params).load()

# In the recorded run printSchema() succeeded and the failure appeared only
# when show() actually fetched the records.
df.printSchema()
df.show(20)

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



Py4JJavaError: An error occurred while calling o607.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 88.0 failed 1 times, most recent failure: Lost task 0.0 in stage 88.0 (TID 327, localhost, executor driver): java.lang.IllegalStateException: Cannot fetch offset 4 (GroupId: spark-kafka-relation-f93a047d-182b-4aea-bd71-037d420abf4c-executor, TopicPartition: test_topic0-0). 
Some data may have been lost because they are not available in Kafka any more; either the
 data was aged out by Kafka or the topic may have been deleted before all the data in the
 topic was processed. If you don't want your streaming query to fail on such cases, set the
 source option "failOnDataLoss" to "false".
    
	at org.apache.spark.sql.kafka010.InternalKafkaConsumer$.org$apache$spark$sql$kafka010$InternalKafkaConsumer$$reportDataLoss0(KafkaDataConsumer.scala:642)
	at org.apache.spark.sql.kafka010.InternalKafkaConsumer.org$apache$spark$sql$kafka010$InternalKafkaConsumer$$reportDataLoss(KafkaDataConsumer.scala:448)
	at org.apache.spark.sql.kafka010.InternalKafkaConsumer$$anonfun$get$1.apply(KafkaDataConsumer.scala:269)
	at org.apache.spark.sql.kafka010.InternalKafkaConsumer$$anonfun$get$1.apply(KafkaDataConsumer.scala:234)
	at org.apache.spark.util.UninterruptibleThread.runUninterruptibly(UninterruptibleThread.scala:77)
	at org.apache.spark.sql.kafka010.InternalKafkaConsumer.runUninterruptiblyIfPossible(KafkaDataConsumer.scala:209)
	at org.apache.spark.sql.kafka010.InternalKafkaConsumer.get(KafkaDataConsumer.scala:234)
	at org.apache.spark.sql.kafka010.KafkaDataConsumer$class.get(KafkaDataConsumer.scala:64)
	at org.apache.spark.sql.kafka010.KafkaDataConsumer$NonCachedKafkaDataConsumer.get(KafkaDataConsumer.scala:506)
	at org.apache.spark.sql.kafka010.KafkaSourceRDD$$anon$1.getNext(KafkaSourceRDD.scala:113)
	at org.apache.spark.sql.kafka010.KafkaSourceRDD$$anon$1.getNext(KafkaSourceRDD.scala:104)
	at org.apache.spark.util.NextIterator.hasNext(NextIterator.scala:73)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.kafka.clients.consumer.OffsetOutOfRangeException: Offsets out of range with no configured reset policy for partitions: {test_topic0-0=4}
	at org.apache.kafka.clients.consumer.internals.Fetcher.parseCompletedFetch(Fetcher.java:970)
	at org.apache.kafka.clients.consumer.internals.Fetcher.fetchedRecords(Fetcher.java:490)
	at org.apache.kafka.clients.consumer.KafkaConsumer.pollForFetches(KafkaConsumer.java:1259)
	at org.apache.kafka.clients.consumer.KafkaConsumer.poll(KafkaConsumer.java:1187)
	at org.apache.kafka.clients.consumer.KafkaConsumer.poll(KafkaConsumer.java:1115)
	at org.apache.spark.sql.kafka010.InternalKafkaConsumer.fetchData(KafkaDataConsumer.scala:470)
	at org.apache.spark.sql.kafka010.InternalKafkaConsumer.org$apache$spark$sql$kafka010$InternalKafkaConsumer$$fetchRecord(KafkaDataConsumer.scala:361)
	at org.apache.spark.sql.kafka010.InternalKafkaConsumer$$anonfun$get$1.apply(KafkaDataConsumer.scala:251)
	... 35 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1925)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1913)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1912)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1912)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:948)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:948)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:948)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2146)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2095)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2084)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:759)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:365)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3389)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$withAction(Dataset.scala:3369)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2764)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:254)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:291)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.IllegalStateException: Cannot fetch offset 4 (GroupId: spark-kafka-relation-f93a047d-182b-4aea-bd71-037d420abf4c-executor, TopicPartition: test_topic0-0). 
Some data may have been lost because they are not available in Kafka any more; either the
 data was aged out by Kafka or the topic may have been deleted before all the data in the
 topic was processed. If you don't want your streaming query to fail on such cases, set the
 source option "failOnDataLoss" to "false".
    
	at org.apache.spark.sql.kafka010.InternalKafkaConsumer$.org$apache$spark$sql$kafka010$InternalKafkaConsumer$$reportDataLoss0(KafkaDataConsumer.scala:642)
	at org.apache.spark.sql.kafka010.InternalKafkaConsumer.org$apache$spark$sql$kafka010$InternalKafkaConsumer$$reportDataLoss(KafkaDataConsumer.scala:448)
	at org.apache.spark.sql.kafka010.InternalKafkaConsumer$$anonfun$get$1.apply(KafkaDataConsumer.scala:269)
	at org.apache.spark.sql.kafka010.InternalKafkaConsumer$$anonfun$get$1.apply(KafkaDataConsumer.scala:234)
	at org.apache.spark.util.UninterruptibleThread.runUninterruptibly(UninterruptibleThread.scala:77)
	at org.apache.spark.sql.kafka010.InternalKafkaConsumer.runUninterruptiblyIfPossible(KafkaDataConsumer.scala:209)
	at org.apache.spark.sql.kafka010.InternalKafkaConsumer.get(KafkaDataConsumer.scala:234)
	at org.apache.spark.sql.kafka010.KafkaDataConsumer$class.get(KafkaDataConsumer.scala:64)
	at org.apache.spark.sql.kafka010.KafkaDataConsumer$NonCachedKafkaDataConsumer.get(KafkaDataConsumer.scala:506)
	at org.apache.spark.sql.kafka010.KafkaSourceRDD$$anon$1.getNext(KafkaSourceRDD.scala:113)
	at org.apache.spark.sql.kafka010.KafkaSourceRDD$$anon$1.getNext(KafkaSourceRDD.scala:104)
	at org.apache.spark.util.NextIterator.hasNext(NextIterator.scala:73)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	... 1 more
Caused by: org.apache.kafka.clients.consumer.OffsetOutOfRangeException: Offsets out of range with no configured reset policy for partitions: {test_topic0-0=4}
	at org.apache.kafka.clients.consumer.internals.Fetcher.parseCompletedFetch(Fetcher.java:970)
	at org.apache.kafka.clients.consumer.internals.Fetcher.fetchedRecords(Fetcher.java:490)
	at org.apache.kafka.clients.consumer.KafkaConsumer.pollForFetches(KafkaConsumer.java:1259)
	at org.apache.kafka.clients.consumer.KafkaConsumer.poll(KafkaConsumer.java:1187)
	at org.apache.kafka.clients.consumer.KafkaConsumer.poll(KafkaConsumer.java:1115)
	at org.apache.spark.sql.kafka010.InternalKafkaConsumer.fetchData(KafkaDataConsumer.scala:470)
	at org.apache.spark.sql.kafka010.InternalKafkaConsumer.org$apache$spark$sql$kafka010$InternalKafkaConsumer$$fetchRecord(KafkaDataConsumer.scala:361)
	at org.apache.spark.sql.kafka010.InternalKafkaConsumer$$anonfun$get$1.apply(KafkaDataConsumer.scala:251)
	... 35 more


In [44]:
# Partition count of the DataFrame's underlying RDD (3 in the recorded run).
# Presumably one per requested topic-partition range - TODO confirm.
df.rdd.getNumPartitions()

3

По умолчанию параметр `startingOffsets` имеет значение `earliest`, а `endingOffsets` - `latest`. Поэтому, когда мы не указывали эти параметры, Spark прочитал содержимое всего топика

Чтобы получить наши данные, которые мы записали в топик, нам необходимо их десериализовать. В нашем случае достаточно использовать `.cast("string")`, однако это работает не всегда, т.к. формат данных может быть произвольным.

In [45]:
# Decode the binary Kafka payload to text. `df` here is the offset-bounded read
# from the previous cells, so in the recorded run this show() fails with the
# same out-of-range offset error.
json_doc = df.select(col("value").cast("string"))

json_doc.show(20, False)

Py4JJavaError: An error occurred while calling o660.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 89.0 failed 1 times, most recent failure: Lost task 0.0 in stage 89.0 (TID 328, localhost, executor driver): java.lang.IllegalStateException: Cannot fetch offset 4 (GroupId: spark-kafka-relation-1e056068-ecf4-4a63-89da-1e4ff6044966-executor, TopicPartition: test_topic0-0). 
Some data may have been lost because they are not available in Kafka any more; either the
 data was aged out by Kafka or the topic may have been deleted before all the data in the
 topic was processed. If you don't want your streaming query to fail on such cases, set the
 source option "failOnDataLoss" to "false".
    
	at org.apache.spark.sql.kafka010.InternalKafkaConsumer$.org$apache$spark$sql$kafka010$InternalKafkaConsumer$$reportDataLoss0(KafkaDataConsumer.scala:642)
	at org.apache.spark.sql.kafka010.InternalKafkaConsumer.org$apache$spark$sql$kafka010$InternalKafkaConsumer$$reportDataLoss(KafkaDataConsumer.scala:448)
	at org.apache.spark.sql.kafka010.InternalKafkaConsumer$$anonfun$get$1.apply(KafkaDataConsumer.scala:269)
	at org.apache.spark.sql.kafka010.InternalKafkaConsumer$$anonfun$get$1.apply(KafkaDataConsumer.scala:234)
	at org.apache.spark.util.UninterruptibleThread.runUninterruptibly(UninterruptibleThread.scala:77)
	at org.apache.spark.sql.kafka010.InternalKafkaConsumer.runUninterruptiblyIfPossible(KafkaDataConsumer.scala:209)
	at org.apache.spark.sql.kafka010.InternalKafkaConsumer.get(KafkaDataConsumer.scala:234)
	at org.apache.spark.sql.kafka010.KafkaDataConsumer$class.get(KafkaDataConsumer.scala:64)
	at org.apache.spark.sql.kafka010.KafkaDataConsumer$NonCachedKafkaDataConsumer.get(KafkaDataConsumer.scala:506)
	at org.apache.spark.sql.kafka010.KafkaSourceRDD$$anon$1.getNext(KafkaSourceRDD.scala:113)
	at org.apache.spark.sql.kafka010.KafkaSourceRDD$$anon$1.getNext(KafkaSourceRDD.scala:104)
	at org.apache.spark.util.NextIterator.hasNext(NextIterator.scala:73)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.kafka.clients.consumer.OffsetOutOfRangeException: Offsets out of range with no configured reset policy for partitions: {test_topic0-0=4}
	at org.apache.kafka.clients.consumer.internals.Fetcher.parseCompletedFetch(Fetcher.java:970)
	at org.apache.kafka.clients.consumer.internals.Fetcher.fetchedRecords(Fetcher.java:490)
	at org.apache.kafka.clients.consumer.KafkaConsumer.pollForFetches(KafkaConsumer.java:1259)
	at org.apache.kafka.clients.consumer.KafkaConsumer.poll(KafkaConsumer.java:1187)
	at org.apache.kafka.clients.consumer.KafkaConsumer.poll(KafkaConsumer.java:1115)
	at org.apache.spark.sql.kafka010.InternalKafkaConsumer.fetchData(KafkaDataConsumer.scala:470)
	at org.apache.spark.sql.kafka010.InternalKafkaConsumer.org$apache$spark$sql$kafka010$InternalKafkaConsumer$$fetchRecord(KafkaDataConsumer.scala:361)
	at org.apache.spark.sql.kafka010.InternalKafkaConsumer$$anonfun$get$1.apply(KafkaDataConsumer.scala:251)
	... 35 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1925)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1913)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1912)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1912)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:948)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:948)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:948)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2146)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2095)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2084)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:759)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:365)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3389)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$withAction(Dataset.scala:3369)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2764)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:254)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:291)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.IllegalStateException: Cannot fetch offset 4 (GroupId: spark-kafka-relation-1e056068-ecf4-4a63-89da-1e4ff6044966-executor, TopicPartition: test_topic0-0). 
Some data may have been lost because they are not available in Kafka any more; either the
 data was aged out by Kafka or the topic may have been deleted before all the data in the
 topic was processed. If you don't want your streaming query to fail on such cases, set the
 source option "failOnDataLoss" to "false".
    
	at org.apache.spark.sql.kafka010.InternalKafkaConsumer$.org$apache$spark$sql$kafka010$InternalKafkaConsumer$$reportDataLoss0(KafkaDataConsumer.scala:642)
	at org.apache.spark.sql.kafka010.InternalKafkaConsumer.org$apache$spark$sql$kafka010$InternalKafkaConsumer$$reportDataLoss(KafkaDataConsumer.scala:448)
	at org.apache.spark.sql.kafka010.InternalKafkaConsumer$$anonfun$get$1.apply(KafkaDataConsumer.scala:269)
	at org.apache.spark.sql.kafka010.InternalKafkaConsumer$$anonfun$get$1.apply(KafkaDataConsumer.scala:234)
	at org.apache.spark.util.UninterruptibleThread.runUninterruptibly(UninterruptibleThread.scala:77)
	at org.apache.spark.sql.kafka010.InternalKafkaConsumer.runUninterruptiblyIfPossible(KafkaDataConsumer.scala:209)
	at org.apache.spark.sql.kafka010.InternalKafkaConsumer.get(KafkaDataConsumer.scala:234)
	at org.apache.spark.sql.kafka010.KafkaDataConsumer$class.get(KafkaDataConsumer.scala:64)
	at org.apache.spark.sql.kafka010.KafkaDataConsumer$NonCachedKafkaDataConsumer.get(KafkaDataConsumer.scala:506)
	at org.apache.spark.sql.kafka010.KafkaSourceRDD$$anon$1.getNext(KafkaSourceRDD.scala:113)
	at org.apache.spark.sql.kafka010.KafkaSourceRDD$$anon$1.getNext(KafkaSourceRDD.scala:104)
	at org.apache.spark.util.NextIterator.hasNext(NextIterator.scala:73)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	... 1 more
Caused by: org.apache.kafka.clients.consumer.OffsetOutOfRangeException: Offsets out of range with no configured reset policy for partitions: {test_topic0-0=4}
	at org.apache.kafka.clients.consumer.internals.Fetcher.parseCompletedFetch(Fetcher.java:970)
	at org.apache.kafka.clients.consumer.internals.Fetcher.fetchedRecords(Fetcher.java:490)
	at org.apache.kafka.clients.consumer.KafkaConsumer.pollForFetches(KafkaConsumer.java:1259)
	at org.apache.kafka.clients.consumer.KafkaConsumer.poll(KafkaConsumer.java:1187)
	at org.apache.kafka.clients.consumer.KafkaConsumer.poll(KafkaConsumer.java:1115)
	at org.apache.spark.sql.kafka010.InternalKafkaConsumer.fetchData(KafkaDataConsumer.scala:470)
	at org.apache.spark.sql.kafka010.InternalKafkaConsumer.org$apache$spark$sql$kafka010$InternalKafkaConsumer$$fetchRecord(KafkaDataConsumer.scala:361)
	at org.apache.spark.sql.kafka010.InternalKafkaConsumer$$anonfun$get$1.apply(KafkaDataConsumer.scala:251)
	... 35 more


### Работа с Kafka с помощью Streaming DF
При создании SDF из Kafka необходимо помнить, что:
- `startingOffsets` по умолчанию имеет значение `latest`
- `endingOffsets` использовать нельзя
- количество сообщений за батч можно (и нужно) ограничить параметром `maxOffsetsPerTrigger` (по умолчанию он не задан, и первый батч будет содержать данные всего топика)

In [46]:
# Clean up before starting a new stream. kill_all is defined earlier in the
# notebook; presumably it stops every active streaming query - verify there.
kill_all()

In [48]:
# Source options for the streaming Kafka reader.
kafka_params = {
    "kafka.bootstrap.servers": "spark-master-1.newprolab.com:6667",
    # Comma-separated list of topics to read from.
    "subscribe": "test_topic0,test_topic1,test_topic2",
    # Start from the beginning of each topic (the streaming default is "latest").
    "startingOffsets": "earliest",
    # Cap how many offsets a single micro-batch may consume; without it the
    # first batch would contain the whole topic.
    "maxOffsetsPerTrigger": "5",
    "minPartitions": "10",
}

sdf = (
    spark.readStream
    .format("kafka")
    .options(**kafka_params)
    .load()
)

# Keep the payload (cast to a readable string) plus the Kafka coordinates
# of every record.
parsed_sdf = sdf.select(
    col("value").cast("string"),
    col("topic"),
    col("partition"),
    col("offset"),
)

# Build the console sink (helper defined earlier in the notebook) and start the query.
sink = create_console_sink(parsed_sdf)
sq = sink.start()

In [49]:
# Stop the running streams; the cell output lists each stopped Kafka source.
kill_all()

Stopped KafkaV2[Subscribe[test_topic0, test_topic1, test_topic2]]
Stopped KafkaV2[Subscribe[test_topic0, test_topic1, test_topic2]]


Если мы перезапустим этот стрим, он повторно прочитает все данные. Чтобы обеспечить сохранение состояния стрима после обработки каждого батча, нам необходимо добавить параметр `checkpointLocation` в опции `writeStream`:

In [52]:
def create_console_sink_with_checkpoint(chk_name, df):
    """Configure a console sink for ``df`` that checkpoints its progress.

    The writer is returned unstarted; the caller is expected to call
    ``.start()`` on it.

    Args:
        chk_name: Name of the checkpoint sub-directory; the checkpoint is
            kept at ``/tmp/chk_gr/<chk_name>``.
        df: Streaming DataFrame to write.

    Returns:
        The configured stream writer: console format, a 10-second
        processing-time trigger, untruncated output and ``numRows`` set to 20.
    """
    checkpoint_dir = "/tmp/chk_gr/{n}".format(n=chk_name)
    writer = (
        df.writeStream
        .format("console")
        .trigger(processingTime="10 seconds")
        .option("checkpointLocation", checkpoint_dir)
        .option("truncate", "false")
        .option("numRows", "20")
    )
    return writer

In [53]:
# Reusing the checkpoint name "test0" lets a restarted stream pick up its
# saved state instead of re-reading all the data.
sink = create_console_sink_with_checkpoint(chk_name="test0", df=parsed_sdf)
sq = sink.start()

In [54]:
# Stop the checkpointed stream; the cell output lists the stopped Kafka source.
kill_all()

Stopped KafkaV2[Subscribe[test_topic0, test_topic1, test_topic2]]


В конце работы не забудьте остановить Spark:

In [None]:
# Shut down the Spark session; cells that use `spark` will not work after this.
spark.stop()

In [None]:
!tree /tmp/chk/test0

In [None]:
!cat /tmp/chk/test0/offsets/6