In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StringType, DoubleType, TimestampType

In [3]:
# Initialize SparkSession with the Kafka JAR
spark = SparkSession.builder \
    .appName("KafkaTaxiStream") \
    .config("spark.jars", "/home/jovyan/jars/spark-sql-kafka-0-10_2.12-3.5.0.jar,/home/jovyan/jars/kafka-clients-3.5.1.jar") \
    .getOrCreate()

print("✅ Spark Session created successfully!")

✅ Spark Session created successfully!


In [4]:
# Define Schema for Incoming Data
schema = StructType() \
    .add("medallion", StringType()) \
    .add("hack_license", StringType()) \
    .add("pickup_datetime", TimestampType()) \
    .add("dropoff_datetime", TimestampType()) \
    .add("trip_time_in_secs", DoubleType()) \
    .add("trip_distance", DoubleType()) \
    .add("pickup_longitude", DoubleType()) \
    .add("pickup_latitude", DoubleType()) \
    .add("dropoff_longitude", DoubleType()) \
    .add("dropoff_latitude", DoubleType()) \
    .add("payment_type", StringType()) \
    .add("fare_amount", DoubleType()) \
    .add("surcharge", DoubleType()) \
    .add("mta_tax", DoubleType()) \
    .add("tip_amount", DoubleType()) \
    .add("tolls_amount", DoubleType())


In [5]:
# Read Data Stream from Kafka
taxi_stream = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:9092") \
    .option("subscribe", "taxi-trips") \
    .option("startingOffsets", "earliest") \
    .load()

# Parse JSON Data from Kafka
parsed_data = taxi_stream.selectExpr("CAST(value AS STRING)") \
    .select(from_json(col("value"), schema).alias("data")) \
    .select("data.*")

# Display Streaming Data in Console
#query = parsed_data.writeStream \
#    .outputMode("append") \
#    .format("console") \
#    .start()

query = parsed_data.writeStream \
    .outputMode("append") \
    .format("memory") \
    .queryName("taxi_trips_table") \
    .start()

#query.awaitTermination()

In [6]:
spark.sql("SELECT * FROM taxi_trips_table LIMIT 10").show()

+---------+------------+---------------+----------------+-----------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+
|medallion|hack_license|pickup_datetime|dropoff_datetime|trip_time_in_secs|trip_distance|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|payment_type|fare_amount|surcharge|mta_tax|tip_amount|tolls_amount|
+---------+------------+---------------+----------------+-----------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+
+---------+------------+---------------+----------------+-----------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+



In [7]:
query.exception()

pyspark.errors.exceptions.captured.StreamingQueryException('org/apache/spark/kafka010/KafkaConfigUpdater\n=== Streaming Query ===\nIdentifier: taxi_trips_table [id = 45a2c580-bbcf-475d-860e-b60c44700583, runId = 33090615-6237-4592-a7cc-3fe7b40a5534]\nCurrent Committed Offsets: {}\nCurrent Available Offsets: {}\n\nCurrent State: INITIALIZING\nThread State: RUNNABLE',
                                                           'org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:332)\n\t at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.$anonfun$run$1(StreamExecution.scala:211)\n\t at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)\n\t at org.apache.spark.JobArtifactSet$.withActiveJobArtifactState(JobArtifactSet.scala:94)\n\t at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:211)',
                              

In [8]:
query.awaitTermination()

StreamingQueryException: [STREAM_FAILED] Query [id = 45a2c580-bbcf-475d-860e-b60c44700583, runId = 33090615-6237-4592-a7cc-3fe7b40a5534] terminated with exception: org/apache/spark/kafka010/KafkaConfigUpdater

In [9]:
spark.sql("SHOW TABLES").show()

+---------+----------------+-----------+
|namespace|       tableName|isTemporary|
+---------+----------------+-----------+
|         |taxi_trips_table|       true|
+---------+----------------+-----------+



In [10]:
parsed_data.isStreaming

True