In [1]:
from kafka import KafkaProducer
import json
import time
import pandas as pd
from pyspark.sql import SparkSession

# --- Kafka connection settings ---
kafka_broker = "127.0.0.1:9092"
kafka_topic = "Airfare_Prediction"


def _encode_json(payload):
    """Serialize a message payload to UTF-8 encoded JSON bytes."""
    return json.dumps(payload).encode('utf-8')


# Producer used to publish each test-set row as a JSON message.
producer = KafkaProducer(bootstrap_servers=kafka_broker, value_serializer=_encode_json)

# --- Input data ---
test_set_path = "Cleaned_dataset.csv"  # Replace with the path to your test set
test_set = pd.read_csv(test_set_path)

# --- Spark session (reuses an existing session if one is already running) ---
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .config("spark.jars.packages", "com.crealytics:spark-excel_2.12:0.13.5")
    .getOrCreate()
)


import numpy as np
# Month number is the second '-'-separated component of the journey date string.
journey_month_text = test_set['Date_of_journey'].str.split('-').str[1]
test_set["Journey_month"] = journey_month_text.astype(int)

# Weekend flag: 1 when the journey falls on a Sunday, 0 otherwise.
is_sunday = test_set["Journey_day"].eq("Sunday")
test_set["Weekend"] = np.where(is_sunday, 1, 0)

# Remove the raw date and the columns that are not streamed, in a single pass.
test_set.drop(columns=['Date_of_journey', 'Airline', 'Flight_code'], inplace=True)

# Spark copy of the prepared frame.
df_test_pyspark = spark.createDataFrame(test_set)

# Columns copied into every Kafka message, in the order they appear in the payload.
MESSAGE_FIELDS = (
    "Journey_day",
    "Class",
    "Source",
    "Departure",
    "Total_stops",
    "Arrival",
    "Destination",
    "Duration_in_hours",
    "Days_left",
    "Journey_month",
    "Weekend",
)

# Publish the test set to Kafka one row at a time.
# The try/finally guarantees the producer is closed even when the loop is
# interrupted (e.g. the cell is stopped mid-stream) or a send raises.
try:
    for _, row in test_set.iterrows():
        # A column missing from the row is sent as an empty string.
        message = {field: row.get(field, "") for field in MESSAGE_FIELDS}

        producer.send(kafka_topic, value=message)
        print(f"Sent: {message}")
        time.sleep(2)  # Pause between messages; lower this to stream faster
finally:
    producer.close()



In [None]:
from kafka import KafkaProducer
import json
import time
import pandas as pd
from pyspark.sql import SparkSession
import numpy as np

# --- Kafka connection settings ---
kafka_broker = "127.0.0.1:9092"
kafka_topic = "Airfare_Prediction"


def _encode_json(payload):
    """Serialize a message payload to UTF-8 encoded JSON bytes."""
    return json.dumps(payload).encode('utf-8')


# Producer used to publish each test-set row as a JSON message.
producer = KafkaProducer(bootstrap_servers=kafka_broker, value_serializer=_encode_json)

# --- Input data ---
test_set_path = "Cleaned_dataset.csv"  # Replace with the path to your test set
test_set = pd.read_csv(test_set_path)

# --- Spark session (reuses an existing session if one is already running) ---
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .config("spark.jars.packages", "com.crealytics:spark-excel_2.12:0.13.5")
    .getOrCreate()
)

# Month number is the second '-'-separated component of the journey date string.
journey_month_text = test_set['Date_of_journey'].str.split('-').str[1]
test_set["Journey_month"] = journey_month_text.astype(int)

# Weekend flag: 1 when the journey falls on a Sunday, 0 otherwise.
is_sunday = test_set["Journey_day"].eq("Sunday")
test_set["Weekend"] = np.where(is_sunday, 1, 0)

# Remove the raw date and the columns that are not streamed, in a single pass.
test_set.drop(columns=['Date_of_journey', 'Airline', 'Flight_code'], inplace=True)

# Order rows from the largest 'Days_left' to the smallest.
test_set = test_set.sort_values("Days_left", ascending=False)

# Spark copy of the prepared, sorted frame.
df_test_pyspark = spark.createDataFrame(test_set)

# Columns copied into every Kafka message, in the order they appear in the payload.
MESSAGE_FIELDS = (
    "Journey_day",
    "Class",
    "Source",
    "Departure",
    "Total_stops",
    "Arrival",
    "Destination",
    "Duration_in_hours",
    "Days_left",
    "Journey_month",
    "Weekend",
)


def _build_message(row, days_left):
    """Build the Kafka payload for one test-set row.

    Args:
        row: A row yielded by ``DataFrame.iterrows()``.
        days_left: Value to publish as ``Days_left``; coerced to ``int``.

    Returns:
        dict: One entry per name in ``MESSAGE_FIELDS``. Each value is read from
        ``row`` (an empty string when the column is absent), except
        ``Days_left``, which is ``int(days_left)``.
    """
    message = {field: row.get(field, "") for field in MESSAGE_FIELDS}
    message["Days_left"] = int(days_left)  # overrides the row's own value, keeps key position
    return message


# Stream rows to Kafka, walking the frame in reverse of its current order.
#
# NOTE(review): the loop ends at the first row whose decremented 'Days_left'
# is <= 1, so rows that come later in the iteration are never sent. Rows whose
# 'Days_left' is below 1 match neither branch and are skipped silently.
# Confirm this is the intended stopping rule.
#
# The try/finally guarantees the producer is closed even when the loop is
# interrupted (e.g. the cell is stopped mid-stream) or a send raises.
try:
    for index, row in test_set[::-1].iterrows():
        if row['Days_left'] > 1:
            # Count the row down by one day in the frame, then publish the new value.
            test_set.loc[index, 'Days_left'] -= 1
            updated_days_left = test_set.loc[index, 'Days_left']

            message = _build_message(row, updated_days_left)
            producer.send(kafka_topic, value=message)
            print(f"Sent: {message}")

            if updated_days_left <= 1:
                # Clamp the stored value to 1 and end the whole stream.
                test_set.loc[index, 'Days_left'] = 1
                break

            time.sleep(2)  # Pause between messages; lower this to stream faster
        elif row['Days_left'] == 1:
            # Already at one day left: publish as-is, with no pause, and move on.
            message = _build_message(row, 1)
            producer.send(kafka_topic, value=message)
            print(f"Sent: {message}")
finally:
    producer.close()


Sent: {'Journey_day': 'Monday', 'Class': 'Economy', 'Source': 'Delhi', 'Departure': 'After 6 PM', 'Total_stops': 'non-stop', 'Arrival': 'Before 6 AM', 'Destination': 'Mumbai', 'Duration_in_hours': 2.25, 'Days_left': 1, 'Journey_month': 1, 'Weekend': 0}
