In [None]:
import os

from pyspark.ml import PipelineModel
from pyspark.sql import SparkSession
from pyspark.sql.functions import split

In [None]:
# Build (or reuse) the Spark session for this notebook.
# "spark.jars.packages" requests the spark-sql-kafka package so the "kafka"
# stream format is available to readStream.
spark = (
    SparkSession.builder
    .appName("MTA_Stream_Predict")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0")
    .getOrCreate()
)

# Only show warnings and errors from Spark in the notebook output.
spark.sparkContext.setLogLevel("WARN")

In [None]:
# Streaming source: subscribe to the turnstile topic on the local Kafka broker,
# reading only records that arrive after the query starts ("latest").
kafka_df = (
    spark.readStream
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": "localhost:9092",
        "subscribe": "mta_turnstile_topic",
        "startingOffsets": "latest",
    })
    .load()
)

# Decode each Kafka record's value to a string column named "csv".
turnstile_values = kafka_df.selectExpr("CAST(value AS STRING) as csv")

# Split the line on commas; the field at index 3 is exposed as STATION.
# NOTE(review): assumes the producer sends comma-separated turnstile rows with
# the station name in the fourth field - confirm against the producer.
split_col = split(turnstile_values["csv"], ",")
station_col = split_col[3].alias("STATION")

# Keep a single row per distinct STATION value seen on the stream.
turnstile_df = turnstile_values.select(station_col).dropDuplicates(["STATION"])

In [None]:
# Load models (trained with handleInvalid='keep').
# The directory holding the saved pipelines defaults to the original local
# location, but can be redirected with the MTA_MODEL_DIR environment variable
# so the notebook also runs on machines other than the author's.
MODEL_DIR = os.environ.get("MTA_MODEL_DIR", "/Users/gopalakrishnaabba").rstrip("/")

# "/" is used explicitly (rather than os.path.join) so that URI-style
# locations such as hdfs:// or s3a:// keep working on any platform.
entries_model = PipelineModel.load(f"{MODEL_DIR}/mta_rf_model_entries")
exits_model = PipelineModel.load(f"{MODEL_DIR}/mta_rf_model_exits")

In [None]:
# Predict entries and exits
def _station_predictions(model, stream_df, output_name):
    """Score `stream_df` with `model` and keep STATION plus the prediction.

    The model's "prediction" column is renamed to `output_name`; every other
    column produced by the pipeline is dropped.
    """
    scored = model.transform(stream_df)
    return scored.select("STATION", "prediction").withColumnRenamed("prediction", output_name)


entries_pred = _station_predictions(entries_model, turnstile_df, "predicted_ENTRIES")
exits_pred = _station_predictions(exits_model, turnstile_df, "predicted_EXITS")

In [None]:
# Inner-join the entry and exit predictions on their shared STATION column,
# giving STATION, predicted_ENTRIES and predicted_EXITS in one frame.
joined_predictions = entries_pred.join(other=exits_pred, on=["STATION"], how="inner")

In [None]:
# Output to console: print each newly produced row in full (no truncation).
query = (
    joined_predictions.writeStream
    .outputMode("append")
    .format("console")
    .option("truncate", False)
    .start()
)

# Block until the stream terminates. The query is always stopped on the way
# out, so interrupting this cell (or a failure inside the stream) does not
# leave it running in the background, where a re-run of the cell would start
# a second, competing query.
try:
    query.awaitTermination()
finally:
    query.stop()
