PULLING RAW LIVE DATA FROM KAFKA AND STORING IT IN DELTA FORMAT

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from datetime import datetime

In [2]:
# Create (or reuse) the Spark session, configured for the Kafka source and Delta Lake.

spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("raw_train_live_status")
    # Maven coordinates: Spark's Kafka connector, the Kafka client, and Delta Lake.
    .config(
        "spark.jars.packages",
        ",".join((
            "org.apache.spark:spark-sql-kafka-0-10_2.13:4.0.0",
            "org.apache.kafka:kafka-clients:3.4.0",
            "io.delta:delta-spark_2.13:4.0.0",
        )),
    )
    # Register Delta's SQL extension and catalog implementation.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # Default table property: enable deletion vectors.
    .config("spark.databricks.delta.properties.defaults.enableDeletionVectors", "true")
    .getOrCreate()
)

In [3]:
# Schema applied to each live-status JSON message: every field is a nullable string.

live_data_schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "eventId",
        "trainId",
        "eventType",
        "stationCode",
        "distanceCovered",
        "totalDistance",
        "noOfDays",
        "actualArrivalTime",
        "expectedArrivalTime",
        "actualDepartureTime",
        "expectedDepartureTime",
        "createdDate",
    )
])

In [4]:
# Open a streaming read on the Kafka topic, starting from the earliest offsets.

kafka_df = (
    spark.readStream
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": "localhost:9092",
        "subscribe": "train_live_status_stream_data",
        "startingOffsets": "earliest",
    })
    .load()
)


In [5]:
# Cast the Kafka message value to a string so it can be parsed as JSON.
val_Df = kafka_df.selectExpr("CAST(value AS STRING)")

# Parse the JSON text with live_data_schema and expand the resulting struct
# into one top-level column per schema field.
exploded_df = (
    val_Df
    .select(from_json(col("value"), live_data_schema).alias("record"))
    .select("record.*")
)



In [None]:
# Stream the parsed records into a Delta table stored under a folder named
# after the date on which this cell is started.

# NOTE(review): this name shadows pyspark.sql.functions.current_date pulled in by
# the wildcard import at the top of the file. It is kept so any other cell that
# reads it keeps working; do not call current_date() as a Spark function after this.
current_date = datetime.now().date()

# Keep the StreamingQuery handle instead of chaining awaitTermination() directly,
# so the stream can actually be stopped (here or from another cell).
streaming_query = (
    exploded_df.writeStream
    .format("delta")
    .outputMode("append")
    .trigger(processingTime='5 seconds')
    .option("checkpointLocation", f"raw_data/raw_live_data/{current_date}/checkpoints")
    .start(f"raw_data/raw_live_data/{current_date}/trainlivestatus")
)

try:
    # Blocks until the query terminates, fails, or the kernel is interrupted.
    streaming_query.awaitTermination()
finally:
    # Always stop the query on the way out (e.g. on a kernel interrupt) so the
    # stream is not left running in the background.
    streaming_query.stop()

# awaitTermination() keeps the stream running; interrupt the Python kernel to end it,
# which triggers streaming_query.stop() via the finally block above.

In [None]:

# ONLY FOR TESTING PURPOSES

# current_date = datetime.now().date()
# df = spark.read.format("delta") \
#     .load(f"file:///d:/SparkFolder/public-transport-delay-tracker/sparkApp/raw_data/raw_live_data/{current_date}/trainlivestatus")
# df.show()