In [None]:
from kafka import KafkaProducer
import json
import time
import pandas as pd
from pyspark.sql import SparkSession
import numpy as np

# Kafka connection settings: broker address and destination topic.
kafka_broker = "127.0.0.1:9092"
kafka_topic = "Airfare_Prediction"


def _to_json_bytes(payload):
    """Return *payload* serialized as UTF-8 encoded JSON bytes."""
    return json.dumps(payload).encode('utf-8')


# Producer whose message values are JSON-encoded by the helper above.
producer = KafkaProducer(
    value_serializer=_to_json_bytes,
    bootstrap_servers=kafka_broker,
)

# Read the test set from CSV into a pandas DataFrame.
test_set_path = "final.csv"  # Replace with the path to your test set
test_set = pd.read_csv(test_set_path)

# Build (or reuse) the Spark session; the spark-excel package is requested
# through the spark.jars.packages setting.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .config("spark.jars.packages", "com.crealytics:spark-excel_2.12:0.13.5")
    .getOrCreate()
)

# Journey month = second '-'-separated field of the date string, as an int.
journey_date_parts = test_set['Date_of_journey'].str.split('-')
test_set["Journey_month"] = journey_date_parts.str[1].astype(int)

# Weekend flag: 1 when the journey day is "Sunday", otherwise 0.
# NOTE(review): only Sunday is flagged; confirm this matches the definition
# used when the model was trained.
is_sunday = test_set["Journey_day"] == "Sunday"
test_set["Weekend"] = np.where(is_sunday, 1, 0)

# Drop the raw date together with the Airline and Flight_code columns.
test_set.drop(
    columns=['Date_of_journey', 'Airline', 'Flight_code'],
    inplace=True,
)

# Sorting by 'Days_left' in descending order is currently disabled:
# test_set = test_set.sort_values(by="Days_left", ascending=False)

# Spark copy of the preprocessed pandas DataFrame.
df_test_pyspark = spark.createDataFrame(test_set)

# Streaming rows to Kafka based on 'Days_left'.
#
# Each test-set row is sent once per remaining day: a row whose 'Days_left'
# is N produces N messages with "Days_left" counting down from N to 1, with a
# pause after every message.

# Fields copied from the row into every message, in output order. "Days_left"
# is listed only to keep its position; its value is replaced by the countdown.
message_fields = (
    "Journey_day",
    "Class",
    "Source",
    "Departure",
    "Total_stops",
    "Arrival",
    "Destination",
    "Duration_in_hours",
    "Days_left",
    "Journey_month",
    "Weekend",
)
send_interval_seconds = 2  # Adjust time interval for streaming

try:
    for _, row in test_set.iterrows():
        # Row-invariant part of the payload, built once per row instead of
        # once per message; missing columns fall back to "".
        base_message = {
            field: row.get(field, "") for field in message_fields
        }

        days_left = row['Days_left']
        while days_left > 0:
            message = {**base_message, "Days_left": days_left}

            # Send message to Kafka
            producer.send(kafka_topic, value=message)
            print(f"Sent: {message}")

            days_left -= 1  # Decrement 'Days_left' for this row
            time.sleep(send_interval_seconds)
finally:
    # Close the producer even when streaming fails or is interrupted
    # (e.g. Ctrl-C during the long-running loop), so it is never leaked.
    producer.close()


:: loading settings :: url = jar:file:/home/lamp/Projects/big-data/venv/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/lamp/.ivy2/cache
The jars for the packages stored in: /home/lamp/.ivy2/jars
com.crealytics#spark-excel_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-33b9143d-2d71-46a3-98a1-54045a035c29;1.0
	confs: [default]
	found com.crealytics#spark-excel_2.12;0.13.5 in central
	found org.apache.poi#poi;4.1.2 in central
	found commons-codec#commons-codec;1.13 in central
	found org.apache.commons#commons-collections4;4.4 in central
	found org.apache.commons#commons-math3;3.6.1 in central
	found com.zaxxer#SparseBitSet;1.2 in central
	found org.apache.poi#poi-ooxml;4.1.2 in central
	found org.apache.poi#poi-ooxml-schemas;4.1.2 in central
	found org.apache.xmlbeans#xmlbeans;3.1.0 in central
	found com.github.virtuald#curvesapi;1.06 in central
	found com.norbitltd#spoiwo_2.12;1.7.0 in central
	found org.scala-lang.modules#scala-xml_2.12;1.2.0 in central
	found com.github.pjfanning#excel-streaming-reader;2.3.4 in central
	foun

Sent: {'Journey_day': 'Sunday', 'Class': 'Economy', 'Source': 'Delhi', 'Departure': 'Before 6 AM', 'Total_stops': '1-stop', 'Arrival': '6 AM - 12 PM', 'Destination': 'Mumbai', 'Duration_in_hours': 5.25, 'Days_left': 7, 'Journey_month': 1, 'Weekend': 1}
Sent: {'Journey_day': 'Sunday', 'Class': 'Economy', 'Source': 'Delhi', 'Departure': 'Before 6 AM', 'Total_stops': '1-stop', 'Arrival': '6 AM - 12 PM', 'Destination': 'Mumbai', 'Duration_in_hours': 5.25, 'Days_left': 6, 'Journey_month': 1, 'Weekend': 1}
Sent: {'Journey_day': 'Sunday', 'Class': 'Economy', 'Source': 'Delhi', 'Departure': 'Before 6 AM', 'Total_stops': '1-stop', 'Arrival': '6 AM - 12 PM', 'Destination': 'Mumbai', 'Duration_in_hours': 5.25, 'Days_left': 5, 'Journey_month': 1, 'Weekend': 1}
Sent: {'Journey_day': 'Sunday', 'Class': 'Economy', 'Source': 'Delhi', 'Departure': 'Before 6 AM', 'Total_stops': '1-stop', 'Arrival': '6 AM - 12 PM', 'Destination': 'Mumbai', 'Duration_in_hours': 5.25, 'Days_left': 4, 'Journey_month': 1, 'W