PULLING RAW LIVE DATA FROM KAFKA AND STORING IT IN DELTA FORMAT

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from datetime import datetime

In [2]:
# Local Spark session wired up for the Kafka source and the Delta Lake sink.

spark = (
    SparkSession.builder
    .appName("raw_train_live_status")
    # Connector jars, passed as one comma-separated list of Maven coordinates.
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.13:4.0.0,"
        "org.apache.kafka:kafka-clients:3.4.0,"
        "io.delta:delta-spark_2.13:4.0.0",
    )
    # Register the Delta SQL extension and catalog implementation.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.databricks.delta.properties.defaults.enableDeletionVectors", "true")
    # Run locally, using every available core.
    .master("local[*]")
    .getOrCreate()
)

In [3]:
# Schema of one train live-status record.
# Every field is declared as a nullable string, in the order listed below.

live_data_schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "eventId",
        "trainId",
        "eventType",
        "stationCode",
        "distanceCovered",
        "totalDistance",
        "noOfDays",
        "actualArrivalTime",
        "expectedArrivalTime",
        "actualDepartureTime",
        "expectedDepartureTime",
        "createdDate",
    )
])

In [5]:
# Batch-read the Kafka topic from its earliest offsets.
# `spark.read` returns a static DataFrame, which is what lets the cells below call `.show()` on it.

kafka_df = (
    spark.read
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "train_live_status_stream_data")
    .option("startingOffsets", "earliest")
    .load()
)


In [6]:
# Preview the raw Kafka records (the default 20 rows) without truncating wide columns.
kafka_df.show(n=20, truncate=False)

+----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------+---------+------+-----------------------+-------------+
|key |value                                                                                                                                                                                                                                                                                                                                                                 |topic                        |partition|offset|timestamp              |timestampType|
+----+--------------------------------------------------------------------------------------------

In [20]:
# Decode the Kafka payload bytes into text.
val_Df = kafka_df.selectExpr("CAST(value AS STRING)")

# Each payload is a comma-separated line (see the output below), not JSON, so it
# is parsed with from_csv; from_json produced only nulls for this input.
# from_csv takes its schema as a DDL string, so derive it from live_data_schema
# to keep the field names and order defined in one place.
live_data_ddl = ", ".join(
    f"{field.name} {field.dataType.simpleString()}"
    for field in live_data_schema.fields
)
exploded_df = val_Df.withColumn("record", from_csv(col("value"), live_data_ddl))

val_Df.show(truncate=False)

+-----------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                  |
+-----------------------------------------------------------------------------------------------------------------------+
|17b5b0e8-38b0-4665-afb0-1c04be499483,5558384,Departure,PR,0,1243,4,nan,nan,05:27,06:04,2025-07-10 09:58:52.767700      |
|8648ec81-323d-4cdd-9d4d-de079fc378e9,5499299,Departure,MSD,0,568,3,nan,nan,14:15,14:45,2025-07-10 09:58:52.768002      |
|43a648c0-0051-40c4-8b3e-9dba997ecba6,2209869,Departure,G,0,1184,3,nan,nan,03:33,03:33,2025-07-10 09:58:52.768613       |
|d8f2fb2e-675e-4a5c-9d89-5d40033f56d5,4437388,Departure,GV,0,1622,3,nan,nan,21:32,21:32,2025-07-10 09:58:52.769550      |
|aa9a270d-a144-418b-878d-cffe251c49c4,8617405,Departure,KJMG,0,1298,4,nan,nan,06:28,06:28,2025-07-10 09:58:52.769839    |
|5b5aeb67-79fb-4ec0-b3a2

In [None]:
# Continuously copy the raw Kafka records into a Delta table.
#
# `kafka_df` is created with `spark.read`, i.e. it is a batch DataFrame, and
# `.writeStream` can only be called on a streaming DataFrame. A dedicated
# streaming source over the same topic is therefore created here.
current_date = datetime.now().date()

kafka_stream_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "train_live_status_stream_data")
    .option("startingOffsets", "earliest")
    .load()
)

# NOTE(review): the checkpoint path contains the run date, so a run started on a
# new day uses a fresh checkpoint and reads the topic from the earliest offset
# again - confirm this is intended.
# The former `truncate` option was dropped: it is a console-sink display option
# and has no meaning for a Delta sink.
# The query handle is kept so the stream can be inspected or stopped later with
# `raw_live_query.stop()`.
raw_live_query = (
    kafka_stream_df.writeStream
    .format("delta")
    .outputMode("append")
    .trigger(processingTime='5 seconds')
    .option("checkpointLocation", f"raw_data/raw_live_data/{current_date}/checkpoints")
    .start(f"raw_data/raw_live_data/{current_date}/trainlivestatus")
)