In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from delta import *

In [None]:
# Session and Configuration

# spark = SparkSession.builder \
#     .appName("raw_train_live_status") \
#     .config("spark.jars", "C:/Program Files/Spark/spark-jars/spark-sql-kafka-0-10_2.13-4.0.0.jar,"
#                       "C:/Program Files/Spark/spark-jars/kafka-clients-3.4.0.jar,"
#                       "C:/Program Files/Spark/spark-jars/commons-pool2-2.11.1.jar,"
#                       "C:/Program Files/Spark/spark-jars/delta-spark_2.13-4.0.0.jar") \
#     .config("spark.sql.extensions","io.delta.sql.DeltaSparkSessionExtension") \
#     .config("spark.sql.catalog.spark_catalog","org.apache.spark.sql.delta.catalog.DeltaCatalog") \
#     .master("local[*]") \
#     .getOrCreate()

# print(spark)

<pyspark.sql.session.SparkSession object at 0x00000157669B8830>


In [5]:
spark = SparkSession.builder \
    .appName("raw_train_live_status") \
    .config("spark.jars.packages",
            "org.apache.spark:spark-sql-kafka-0-10_2.13:4.0.0,"
            "org.apache.kafka:kafka-clients:3.4.0,"
            "io.delta:delta-spark_2.13:4.0.0") \
    .config("spark.sql.extensions","io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog","org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .master("local[*]") \
    .getOrCreate()

In [39]:


kafka_schem = StructType([
    StructField("schema_type", StringType(), True),
    StructField("data", StringType(), True),
])

#TrainMaster schema
train_master_schema =ArrayType(StructType([
    StructField("trainId", IntegerType(), True),
    StructField("trainNo", IntegerType(), True),
    StructField("category", StringType(), True),
    StructField("operator", StringType(), True)
]))

#StationInfo schema
station_info_schema = ArrayType(StructType([
    StructField("stationId", IntegerType(), True),
    StructField("stationName", StringType(), True),
    StructField("stationCode", StringType(), True),
    StructField("latitude", StringType(), True),   # or DoubleType() if values are numeric
    StructField("longitude", StringType(), True)   # or DoubleType() if values are numeric

]))

# TrainLiveStatus schema
train_schedule_time_schema = ArrayType(StructType([
    StructField("scheduleId", LongType(), True),
    StructField("trainId", StringType(), True),
    StructField("trainNumber", IntegerType(), True),
    StructField("trainName", StringType(), True),
    StructField("scheduledArrivalTime", StringType(), True),
    StructField("scheduledDepartureTime", StringType(), True),
    StructField("stationCodes", ArrayType(StringType()), True),
    StructField("createdDate", StringType(), True)
]))

In [11]:
kafka_df = spark.read \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "train_station_info_batch_data") \
    .option("startingOffsets", "earliest") \
    .load()


In [None]:
decoded_df = kafka_df.selectExpr("CAST(value AS STRING) as json_string")
decoded_df = decoded_df.withColumn("data", from_json(col("json_string"), kafka_schem)).drop("json_string")
decoded_df.printSchema()

root
 |-- data: struct (nullable = true)
 |    |-- schema_type: string (nullable = true)
 |    |-- data: string (nullable = true)

+--------------------+
|                data|
+--------------------+
|{stationInfo, [{"...|
|{trainMaster, [{"...|
|{trainScheduleTim...|
+--------------------+



In [None]:
flattened_df = decoded_df \
    .withColumn("schema_type", col("data.schema_type")) \
    .withColumn("record", col("data.data")) \
    .drop("data") 

flattened_df.printSchema()

+-----------------+--------------------+
|      schema_type|              record|
+-----------------+--------------------+
|      stationInfo|[{"stationId":1,"...|
|      trainMaster|[{"trainId":70794...|
|trainScheduleTime|[{"scheduleId":"4...|
+-----------------+--------------------+



In [40]:
station_df = flattened_df.filter(col("schema_type") == "stationInfo")\
    .withColumn("record",from_json(col("record"), station_info_schema).alias("stationInfo"))\
    .withColumn("record",explode_outer(col("record")))\
    .select("record.*")


  
station_df.show()


+---------+--------------+-----------+--------------+---------------+
|stationId|   stationName|stationCode|      latitude|      longitude|
+---------+--------------+-----------+--------------+---------------+
|        1|    Churchgate|        CCG|-20.1492537313|  17.0149253731|
|        2|  Marine Lines|        MEL| 47.0149253731| -27.7611940299|
|        3|   Charni Road|        CYR| -74.776119403|  70.7462686567|
|        4|    Grant Road|        GTR|-10.2985074627|-124.4776119403|
|        5|Mumbai Central|       MMCT| 38.9552238806|  -2.6865671642|
|        6|   Mahalakshmi|         MX|-37.1641791045| 160.2985074627|
|        7|   Lower Parel|         PL|          30.0| 172.8358208955|
|        8|    Prabhadevi|       PBHD|-51.4925373134| 117.3134328358|
|        9|         Dadar|   D/DR/DDR| 55.9701492537|-142.3880597015|
|       10|  Matunga Road|        MRU|-85.5223880597|   4.4776119403|
|       11|      Mahim Jn|         MM|  2.2388059701|-144.1791044776|
|       12|        B

In [8]:
decoded_df.writeStream \
    .format("json") \
    .option("path", "output_data/") \
    .option("checkpointLocation", "chk_dir/") \
    .start()

<pyspark.sql.streaming.query.StreamingQuery at 0x19766b61940>

In [None]:
query = decoded_df.writeStream \
    .format("console") \
    .trigger(processingTime='5 seconds') \
    .outputMode("append") \
    .start()

query.awaitTermination()