In [5]:
pip install pyspark==3.5.2

Collecting pyspark==3.5.2
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting py4j==0.10.9.7
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.5/200.5 kB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812381 sha256=a2ead8b1b78ccf9606336a9c4172d6eeb34d37c4d52ca868c435faaf76fa2dd8
  Stored in directory: /root/.cache/pip/wheels/11/67/ea/33c283e520b775aa7a7a0d404447e287be841a711d074d4d91
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.7 pyspark-3.5.2
[0m
[1m[[0m[34;4

In [2]:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, window
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# Build (or reuse) the Spark session; `spark.jars.packages` requests the
# Kafka source connector for Structured Streaming.
spark = (
    SparkSession.builder
    .appName("orders-streaming")
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.2",
    )
    .getOrCreate()
)

# Only show warnings and errors from Spark in the cell output.
spark.sparkContext.setLogLevel("WARN")

# Expected layout of the JSON payload carried in each Kafka record's value.
# `event_time` is read as a string here and cast to a timestamp downstream.
schema = (
    StructType()
    .add("order_id", IntegerType())
    .add("customer_id", IntegerType())
    .add("amount", DoubleType())
    .add("status", StringType())
    .add("event_time", StringType())
)

# Report the Scala version of the running JVM; the `_2.12` suffix of the
# connector artifact above has to match it.
print("Scala runtime:", spark.sparkContext._jvm.scala.util.Properties.versionString())

# Streaming source: subscribe to the `orders` topic on the Kafka broker.
kafka_df = (
    spark.readStream
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": "kafka:9092",
        "subscribe": "orders",
    })
    .load()
)

:: loading settings :: url = jar:file:/usr/local/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-749c110d-0811-4944-ba28-e8e1b5bdeb9c;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.2 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.2 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.5 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in central
downloading https://repo1.maven.org/maven2/org/apache/spark/spark-sql-kafka-

Scala runtime: version 2.12.18


25/09/14 04:14:51 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [None]:


# Decode the Kafka `value` bytes as a string, parse it as JSON using `schema`,
# and flatten the parsed struct into top-level columns.
json_df = (kafka_df
    .selectExpr("CAST(value AS STRING) as value")
    .select(from_json(col("value"), schema).alias("data"))
    .select("data.*"))

# Maximum lateness tolerated for an event before its window may be finalized.
WATERMARK_DELAY = "10 minutes"

# Derive the event-time column once and share it between both aggregations.
# The watermark lets Spark discard the state of windows older than the allowed
# lateness; without it the state of the windowed aggregations keeps growing for
# as long as the queries run. Events arriving later than WATERMARK_DELAY may be
# dropped instead of being aggregated.
events_df = (json_df
    .withColumn("ts", col("event_time").cast("timestamp"))
    .withWatermark("ts", WATERMARK_DELAY))

# 1) Sum of `amount` per 1-minute tumbling window.
agg_by_min = (events_df
    .groupBy(window(col("ts"), "1 minute")).sum("amount")
    .withColumnRenamed("sum(amount)", "amount_sum"))

# 2) Row count per status and customer, over the same 1-minute windows.
agg_status_client = (events_df
    .groupBy(window(col("ts"), "1 minute"), col("status"), col("customer_id")).count())


def start_console_query(df):
    """Start a streaming query that prints `df` to the console.

    The query runs in "update" output mode, does not truncate cell values and
    prints up to 50 rows per micro-batch.

    Args:
        df: Streaming DataFrame to write.

    Returns:
        The started StreamingQuery handle.
    """
    return (df
        .writeStream
        .outputMode("update")
        .format("console")
        .option("truncate", "false")
        .option("numRows", 50)
        .start())


# Console output for both aggregations.
q1 = start_console_query(agg_by_min)
q2 = start_console_query(agg_status_client)

# Optional Parquet persistence (disabled). The file sink only supports the
# "append" output mode, which for aggregations requires the watermark set on
# `events_df` above.
# out_path = "/opt/spark-app/output/orders_agg"
# q3 = (agg_by_min
#    .writeStream
#    .outputMode("append")
#    .format("parquet")
#    .option("path", out_path)
#    .option("checkpointLocation", "/opt/spark-app/output/_chk_orders_agg")
#    .start())

# Block until any query terminates (or the kernel is interrupted), then stop
# the queries started by this cell so none is left running in the background.
try:
    spark.streams.awaitAnyTermination()
except KeyboardInterrupt:
    # Interrupting the kernel is the expected way to end the stream here.
    pass
finally:
    for started_query in (q1, q2):
        started_query.stop()

25/09/14 04:15:09 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-c51aea88-6522-4595-a491-49b7ca34e032. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/09/14 04:15:09 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/09/14 04:15:09 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-12b1a90d-dacf-4cc6-946a-4db452d63510. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/09/14 04:15:09 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not support

-------------------------------------------
Batch: 0
-------------------------------------------
+------+------+-----------+-----+
|window|status|customer_id|count|
+------+------+-----------+-----+
+------+------+-----------+-----+



                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+------+----------+
|window|amount_sum|
+------+----------+
+------+----------+

