In [8]:
# streaming_iot.py

from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, avg, window, to_timestamp
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

# Obtain the SparkSession used by every cell of this streaming notebook.
spark = (
    SparkSession.builder
    .appName("Streaming-IoT")
    .getOrCreate()
)


In [None]:
# Schema of the JSON sensor records read from file (streaming simulation).
# File-based input replaces the socket stream.
# The timestamp is declared as a plain string here; all fields are nullable.
schema = (
    StructType()
    .add("timestamp", StringType())
    .add("sensor_id", StringType())
    .add("equipment_id", StringType())
    .add("temperature", DoubleType())
    .add("vibration", DoubleType())
    .add("pressure", DoubleType())
    .add("status", StringType())
)

In [11]:
# Streaming read of the JSON sensor data using the explicit schema.
# maxFilesPerTrigger=1 limits each micro-batch to one new file, which is what
# simulates a real-time feed.
# NOTE(review): the file source ingests whole files; if this path is a single
# file rather than a directory of files, all records presumably arrive in one
# micro-batch. Confirm the layout under ./data.
df_stream = (
    spark.readStream
    .schema(schema)
    .option("maxFilesPerTrigger", 1)
    .json("./data/sensors_stream.json")
)

In [12]:
# Convert the string "timestamp" field into a timestamp column named "ts".
# (No JSON parsing happens here: the records are already structured columns.)
df_parsed = df_stream.withColumn("ts", to_timestamp(col("timestamp")))

# Per-sensor averages over 1-minute windows that slide every 30 seconds,
# with a 1-minute watermark on "ts".
df_agg = (
    df_parsed
    .withWatermark("ts", "1 minute")
    .groupBy(window(col("ts"), "1 minute", "30 seconds"), col("sensor_id"))
    .agg(
        *(
            avg(metric).alias(alias)
            for metric, alias in (
                ("temperature", "avg_temp"),
                ("vibration", "avg_vibration"),
                ("pressure", "avg_pressure"),
            )
        )
    )
)


In [21]:
# Console sink (debugging / visual inspection): the aggregated stream is
# written in "complete" output mode with column truncation disabled.
query_console = (
    df_agg.writeStream
    .format("console")
    .outputMode("complete")
    .option("truncate", False)
    .start()
)

In [None]:
# Parquet sink: the aggregated stream is written in "append" output mode,
# with the query's checkpoint kept in a dedicated checkpoint directory.
query_parquet = (
    df_agg.writeStream
    .format("parquet")
    .outputMode("append")
    .option("checkpointLocation", "checkpoints/streaming_temp")
    .option("path", "output_lab4/streaming_temp.parquet")
    .start()
)


In [None]:
# Block until both streaming queries have stopped.
#
# Waiting on query_console.awaitTermination() with no timeout would block on
# the console query alone, so a failure of the Parquet query would go
# unnoticed for as long as the console query keeps running. Polling each
# query with a short timeout keeps the "wait for both" behaviour while
# re-raising an exception from either query promptly.
_pending_queries = [query_console, query_parquet]
while _pending_queries:
    # awaitTermination(timeout_seconds) returns True once the query has
    # terminated and raises if the query terminated with an exception.
    _pending_queries = [
        query for query in _pending_queries if not query.awaitTermination(1)
    ]
