In [1]:
pip install pyspark==3.5.2

Collecting pyspark==3.5.2
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m855.0 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting py4j==0.10.9.7
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.5/200.5 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812381 sha256=e3ab81483c37c124c818f631dc4c4003aab3c792dc64bcc056b3846d1914b131
  Stored in directory: /root/.cache/pip/wheels/11/67/ea/33c283e520b775aa7a7a0d404447e287be841a711d074d4d91
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.7 pyspark-3.5.2
[0m
[1m[[0m[34;4

In [2]:
# ---- Imports básicos de Spark SQL y funciones de streaming ----
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, window
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# ---- Create (or reuse) the SparkSession ----
# Requests the Kafka connector package for Spark 3.5.x (Scala 2.12).
KAFKA_CONNECTOR_PACKAGE = "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.2"

spark = (
    SparkSession.builder
    .appName("orders-streaming")
    .config("spark.jars.packages", KAFKA_CONNECTOR_PACKAGE)
    .getOrCreate()
)

# Lower the log noise.
spark.sparkContext.setLogLevel("WARN")

# ---- Schema of the JSON carried in the Kafka message value ----
# Lists the expected fields so the message can be parsed as JSON.
order_fields = [
    ("order_id", IntegerType()),
    ("customer_id", IntegerType()),
    ("amount", DoubleType()),
    ("status", StringType()),
    # Arrives as an ISO 8601 string; it is cast to a timestamp later on.
    ("event_time", StringType()),
]
schema = StructType(
    [StructField(field_name, field_type) for field_name, field_type in order_fields]
)

# Print the active Scala version (useful for diagnosing version mismatches).
scala_properties = spark.sparkContext._jvm.scala.util.Properties
print("Scala runtime:", scala_properties.versionString())

# ---- Data source: Kafka (streaming read) ----
# Subscribes to the "orders" topic on the broker at "kafka:9092".
# Note: startingOffsets defaults to "latest". To read the topic from the
# beginning add
#     .option("startingOffsets", "earliest")
# and, in demos where offsets get rotated,
#     .option("failOnDataLoss", "false")
# can be useful as well.
KAFKA_BOOTSTRAP_SERVERS = "kafka:9092"
ORDERS_TOPIC = "orders"

kafka_df = (
    spark.readStream
    .format("kafka")
    .option("subscribe", ORDERS_TOPIC)
    .option("kafka.bootstrap.servers", KAFKA_BOOTSTRAP_SERVERS)
    .load()
)

# ---- Parse the JSON payload ----
# 1) The Kafka value arrives as binary, so cast it to a string.
# 2) Parse that string with from_json using the schema defined earlier.
# 3) Flatten the resulting struct into real columns (order_id, customer_id, ...).
raw_value = col("value").cast("string")
parsed_order = from_json(raw_value, schema).alias("data")
json_df = kafka_df.select(parsed_order).select("data.*")

# ---- Shared event-time column ----
# event_time arrives as a string; cast it to a timestamp once and reuse the
# result in both aggregations instead of repeating the cast.
# The watermark bounds the streaming state: without it Spark has to keep every
# window (and every status/customer key) for as long as the query runs, so
# memory keeps growing. Events that arrive more than LATE_EVENT_TOLERANCE behind
# the newest event time seen so far may be dropped; tune the value to how late
# producers can realistically be.
LATE_EVENT_TOLERANCE = "10 minutes"
WINDOW_DURATION = "1 minute"

events_df = (json_df
    .withColumn("ts", col("event_time").cast("timestamp"))
    .withWatermark("ts", LATE_EVENT_TOLERANCE))

# ---- Aggregation 1: sum of amounts per 1-minute tumbling window ----
agg_by_min = (events_df
    .groupBy(window(col("ts"), WINDOW_DURATION))
    .sum("amount")
    .withColumnRenamed("sum(amount)", "amount_sum"))

# ---- Aggregation 2: row count per status and customer in 1-minute windows ----
agg_status_client = (events_df
    .groupBy(
        window(col("ts"), WINDOW_DURATION),
        col("status"),
        col("customer_id"),
    )
    .count())

# ---- Sinks: print results to the console (update mode) ----
def start_console_query(result_df, num_rows=50):
    """Start a streaming query that prints ``result_df`` to the console.

    Both aggregations use exactly the same sink settings, so they are kept in
    one place instead of being copy-pasted per query.

    Args:
        result_df: Streaming DataFrame whose rows should be printed.
        num_rows: Value passed to the console sink's ``numRows`` option.

    Returns:
        The handle returned by ``DataStreamWriter.start()``.
    """
    return (result_df
        .writeStream
        .outputMode("update")     # only rows affected by newly arrived data are emitted
        .format("console")
        .option("truncate", "false")
        .option("numRows", num_rows)
        .start())


# q1 shows the sum of amounts per minute
q1 = start_console_query(agg_by_min)

# q2 shows counts per status/customer per minute
q2 = start_console_query(agg_status_client)

# ---- Keep the application alive until a query stops or fails ----
# awaitAnyTermination() returns, or raises StreamingQueryException, as soon as
# ONE query ends; interrupting the kernel raises out of it as well. In all of
# those cases any other query would keep running in the background, so stop
# whatever is still active before leaving the cell. The original exception, if
# any, still propagates.
try:
    spark.streams.awaitAnyTermination()
finally:
    for active_query in spark.streams.active:
        active_query.stop()

:: loading settings :: url = jar:file:/usr/local/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-50ab9f2c-c344-41ef-8ccf-4d727a9e7e00;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.2 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.2 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.5 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in central
downloading https://repo1.maven.org/maven2/org/apache/spark/spark-sql-kafka-

Scala runtime: version 2.12.18


25/09/14 04:44:11 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-19af53f2-a669-4885-a158-2f10742f3d4f. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/09/14 04:44:11 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/09/14 04:44:11 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-5c056d70-fe50-4621-9ae9-0b2e731ab48b. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/09/14 04:44:11 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not support

StreamingQueryException: [STREAM_FAILED] Query [id = ececb034-2ef4-4707-88d1-351a35b57ea1, runId = b405e03a-8681-41f6-92ad-5c71b6dc68d3] terminated with exception: org.apache.kafka.common.errors.UnknownTopicOrPartitionException: This server does not host this topic-partition.

25/09/14 04:44:17 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


## 