In [1]:
from pyspark.sql import SparkSession
from pyspark.ml import PipelineModel
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, FloatType

# Initialize Spark session
spark = SparkSession.builder \
    .appName("KafkaSparkStreaming") \
    .config("spark.jars.packages","org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.4")\
    .getOrCreate()

# Kafka configuration
kafka_broker = "127.0.0.1:9092"
kafka_topic = "Airfare_Prediction"

# Define the schema for incoming Kafka messages
schema = StructType([
    # StructField("Airline", StringType(), True),  # Corrected to StringType
    StructField("Journey_day", StringType(), True),
    StructField("Class", StringType(), True),
    StructField("Source", StringType(), True),
    StructField("Departure", StringType(), True),
    StructField("Total_stops", StringType(), True),
    StructField("Arrival", StringType(), True),
    StructField("Destination", StringType(), True),
    StructField("Duration_in_hours", FloatType(), True),
    StructField("Days_left", IntegerType(), True),
    StructField("Journey_month", IntegerType(), True),
    StructField("Weekend", IntegerType(), True),
    # StructField("Dep_min", IntegerType(), True),
    # StructField("Arrival_hour", IntegerType(), True),
    # StructField("Arrival_min", IntegerType(), True),
    # StructField("Price", DoubleType(), True),
])
# Load the trained model
model_path = "model"
try:
    model = PipelineModel.load(model_path)
    print("Model loaded successfully!")
except Exception as e:
    print(f"Failed to load model: {e}")

kafka_stream = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", kafka_broker) \
    .option("subscribe", kafka_topic) \
    .option("startingOffsets", "latest") \
    .load()



# Deserialize and parse the Kafka messages
parsed_stream = kafka_stream.selectExpr("CAST(value AS STRING) as json") \
    .select(from_json(col("json"), schema).alias("data")) \
    .select("data.*")  # Extract nested fields

# Handle null values in the parsed stream
parsed_stream = parsed_stream.na.drop()  # Remove rows with null values



# Apply the trained model to the streaming data
predictions = model.transform(parsed_stream)

# Rename and select relevant columns
predictions = predictions.withColumnRenamed("prediction", "Predict_Price") \
    .select(
        col("Source"),
        col("Destination"),
        col("Predict_Price")
    )

# Output predictions to the console in a tabular format
query = predictions.writeStream \
    .outputMode("append") \
    .format("console") \
    .option("truncate", "false") \
    .option("numRows", 3) \
    .trigger(processingTime="5 seconds") \
    .start()

query.awaitTermination()


:: loading settings :: url = jar:file:/home/lamp/Projects/big-data/venv/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/lamp/.ivy2/cache
The jars for the packages stored in: /home/lamp/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-d38ed935-6d3d-4f53-b275-8db9786842ae;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.4 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.4 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.5 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in central
:: resolution report :: resolve 264ms :: artifacts dl 8ms
	:: modu

Model loaded successfully!


25/01/23 01:10:34 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-1ab2f83c-795c-41b0-8e99-a8ba53b4a109. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/01/23 01:10:34 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/01/23 01:10:34 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.


-------------------------------------------
Batch: 0
-------------------------------------------
+------+-----------+-------------+
|Source|Destination|Predict_Price|
+------+-----------+-------------+
+------+-----------+-------------+



25/01/23 01:10:36 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/01/23 01:10:37 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+------+-----------+------------------+
|Source|Destination|Predict_Price     |
+------+-----------+------------------+
|Delhi |Mumbai     |11516.737578852053|
+------+-----------+------------------+

-------------------------------------------
Batch: 2
-------------------------------------------
+------+-----------+------------------+
|Source|Destination|Predict_Price     |
+------+-----------+------------------+
|Delhi |Mumbai     |11574.124681295985|
|Delhi |Mumbai     |10942.421512258064|
+------+-----------+------------------+

-------------------------------------------
Batch: 3
-------------------------------------------
+------+-----------+------------------+
|Source|Destination|Predict_Price     |
+------+-----------+------------------+
|Delhi |Mumbai     |11435.696201320678|
|Delhi |Mumbai     |16318.801133794286|
+------+-----------+------------------+



25/01/23 01:10:45 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


-------------------------------------------
Batch: 4
-------------------------------------------
+------+-----------+------------------+
|Source|Destination|Predict_Price     |
+------+-----------+------------------+
|Delhi |Mumbai     |15894.852024490549|
|Delhi |Mumbai     |12138.82526277325 |
|Delhi |Mumbai     |15776.228415581698|
+------+-----------+------------------+

-------------------------------------------
Batch: 5
-------------------------------------------
+------+-----------+------------------+
|Source|Destination|Predict_Price     |
+------+-----------+------------------+
|Delhi |Mumbai     |15776.228415581698|
|Delhi |Mumbai     |11874.455355672288|
+------+-----------+------------------+

-------------------------------------------
Batch: 6
-------------------------------------------
+------+-----------+------------------+
|Source|Destination|Predict_Price     |
+------+-----------+------------------+
|Delhi |Mumbai     |12091.546713233676|
|Delhi |Mumbai     |11527.5

                                                                                

-------------------------------------------
Batch: 16
-------------------------------------------
+------+-----------+-----------------+
|Source|Destination|Predict_Price    |
+------+-----------+-----------------+
|Delhi |Mumbai     |8935.422254664778|
|Delhi |Mumbai     |8930.638967713408|
|Delhi |Mumbai     |8509.983500037299|
+------+-----------+-----------------+



                                                                                

-------------------------------------------
Batch: 17
-------------------------------------------
+------+-----------+------------------+
|Source|Destination|Predict_Price     |
+------+-----------+------------------+
|Delhi |Mumbai     |8805.93285392542  |
|Delhi |Mumbai     |11537.912534883624|
+------+-----------+------------------+

-------------------------------------------
Batch: 18
-------------------------------------------
+------+-----------+-----------------+
|Source|Destination|Predict_Price    |
+------+-----------+-----------------+
|Delhi |Mumbai     |7155.428552001598|
|Delhi |Mumbai     |8805.93285392542 |
|Delhi |Mumbai     |8862.99877295151 |
+------+-----------+-----------------+



                                                                                

-------------------------------------------
Batch: 19
-------------------------------------------
+------+-----------+-----------------+
|Source|Destination|Predict_Price    |
+------+-----------+-----------------+
|Delhi |Mumbai     |12918.16008299022|
|Delhi |Mumbai     |8047.158922422682|
+------+-----------+-----------------+



ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/home/lamp/Projects/big-data/venv/lib/python3.12/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: reentrant call inside <_io.BufferedReader name=66>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/lamp/Projects/big-data/venv/lib/python3.12/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/lamp/Projects/big-data/venv/lib/python3.12/site-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/home/lamp

Py4JError: An error occurred while calling o453.awaitTermination

-------------------------------------------
Batch: 20
-------------------------------------------
+------+-----------+------------------+
|Source|Destination|Predict_Price     |
+------+-----------+------------------+
|Delhi |Mumbai     |9010.376460880549 |
|Delhi |Mumbai     |11914.091409958268|
|Delhi |Mumbai     |13149.8624206656  |
+------+-----------+------------------+

-------------------------------------------
Batch: 21
-------------------------------------------
+------+-----------+------------------+
|Source|Destination|Predict_Price     |
+------+-----------+------------------+
|Delhi |Mumbai     |15755.807576070885|
|Delhi |Mumbai     |15328.990419144515|
+------+-----------+------------------+



                                                                                

-------------------------------------------
Batch: 22
-------------------------------------------
+------+-----------+------------------+
|Source|Destination|Predict_Price     |
+------+-----------+------------------+
|Delhi |Mumbai     |15116.084128636572|
|Delhi |Mumbai     |13996.712558970408|
|Delhi |Mumbai     |11418.730520905667|
+------+-----------+------------------+



                                                                                

-------------------------------------------
Batch: 23
-------------------------------------------
+------+-----------+------------------+
|Source|Destination|Predict_Price     |
+------+-----------+------------------+
|Delhi |Mumbai     |12604.744297925406|
|Delhi |Mumbai     |13899.45560824933 |
+------+-----------+------------------+



                                                                                

-------------------------------------------
Batch: 24
-------------------------------------------
+------+-----------+-----------------+
|Source|Destination|Predict_Price    |
+------+-----------+-----------------+
|Delhi |Mumbai     |8862.99877295151 |
|Delhi |Mumbai     |9167.086019250839|
|Delhi |Mumbai     |14296.21749113198|
+------+-----------+-----------------+

-------------------------------------------
Batch: 25
-------------------------------------------
+------+-----------+------------------+
|Source|Destination|Predict_Price     |
+------+-----------+------------------+
|Delhi |Mumbai     |13325.248795548992|
|Delhi |Mumbai     |13302.409136779961|
+------+-----------+------------------+

-------------------------------------------
Batch: 26
-------------------------------------------
+------+-----------+------------------+
|Source|Destination|Predict_Price     |
+------+-----------+------------------+
|Delhi |Mumbai     |8953.310541854456 |
|Delhi |Mumbai     |14171.99235

                                                                                

-------------------------------------------
Batch: 27
-------------------------------------------
+------+-----------+------------------+
|Source|Destination|Predict_Price     |
+------+-----------+------------------+
|Delhi |Mumbai     |14579.671449916636|
|Delhi |Mumbai     |14476.80830939426 |
+------+-----------+------------------+

-------------------------------------------
Batch: 28
-------------------------------------------
+------+-----------+------------------+
|Source|Destination|Predict_Price     |
+------+-----------+------------------+
|Delhi |Mumbai     |11324.723095790616|
|Delhi |Mumbai     |8862.99877295151  |
|Delhi |Mumbai     |12827.534990485732|
+------+-----------+------------------+

-------------------------------------------
Batch: 29
-------------------------------------------
+------+-----------+------------------+
|Source|Destination|Predict_Price     |
+------+-----------+------------------+
|Delhi |Mumbai     |6802.4132790873855|
|Delhi |Mumbai     |8509