In [2]:
from pyspark.ml import PipelineModel

# GCS path of the saved pipeline that is passed to PipelineModel.load below.
model_path = "gs://model_stored_for_pred_forecast/daily_prediction/"
# Load the persisted pipeline; later cells call model.transform() on it.
model = PipelineModel.load(model_path)

                                                                                

In [3]:
# Sanity check: display the class of the object that was loaded.
type(model)

pyspark.ml.pipeline.PipelineModel

In [4]:
# Display the ordered list of stages contained in the loaded pipeline.
model.stages

[VectorAssembler_dc8a971570d3, SparkXGBRegressor_740fba29192e]

In [5]:
# model.stages[0].getInputCols()  # Optional check: uncomment to list the input columns expected by the first pipeline stage

In [6]:
# Identify the BigQuery table that holds the rows to be scored
project_id = "lively-encoder-448916-d5"
dataset_id = "nyc_subway"
table_id = "daily_future_pred_input"

# Load the table into a Spark DataFrame via the "bigquery" data source;
# the fully qualified name is <project>.<dataset>.<table>.
input_daily_df = (
    spark.read.format("bigquery")
    .option("table", ".".join([project_id, dataset_id, table_id]))
    .option("parentProject", project_id)
    .load()
)

# Preview a single row of the input
input_daily_df.show(1)

25/04/19 06:33:47 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+------------------+---------------------+-------------+--------------------+-----------+------------------+-------------------+-------------+------------------+-------------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+-------------------+---------------------+---------------------+-----------------------+-----------------------+-----------------+------------------+--------------------+----------------------+----------------------+------------------------+------------------------+
|transit_mode_i

In [7]:
# Run the loaded pipeline over the input rows and keep only the encoded
# categorical keys together with the model's prediction column.
output_predictions = (
    model.transform(input_daily_df)
    .select(
        'transit_mode_index',
        'station_complex_index',
        'borough_index',
        'payment_method_index',
        'prediction',
    )
)

In [8]:
from pyspark.sql.functions import col, lit, create_map
# Import Spark's round under an alias: importing it as a bare `round` would
# shadow Python's built-in round() for every later cell in the session.
from pyspark.sql.functions import round as spark_round


def _flat_literals(mapping):
    """Flatten a dict into [lit(k1), lit(v1), lit(k2), lit(v2), ...].

    This alternating key/value layout is the argument form create_map expects.
    """
    return [lit(item) for pair in mapping.items() for item in pair]


# Encoded index -> original label for each categorical column
payment_method_mapping = {1: 'metrocard', 2: 'omny'}
transit_mode_mapping = {1: 'staten_island_railway', 2: 'subway', 3: 'tram'}
borough_mapping = {1: 'Bronx', 2: 'Brooklyn', 3: 'Manhattan', 4: 'Queens', 5: 'Staten Island'}

# Station names come from a lookup CSV. The index is cast to int so that it
# matches the int-cast lookup key used when the mapping is applied below.
station_df = spark.read.csv("gs://bucket_jars/station_complex_csv/station_complex.csv", header=True)
station_df = station_df.withColumn("station_complex_index", col("station_complex_index").cast("int"))
station_df = station_df.select("station_complex_index", "station_complex")
# A missing or non-numeric index becomes null after the cast, and a null map
# key makes create_map fail at runtime, so such rows are excluded up front.
station_df = station_df.where(col("station_complex_index").isNotNull())

# Collect the lookup to the driver as {index: name}
# (assumes the station list is small enough to collect -- TODO confirm).
station_map = station_df.rdd.collectAsMap()
station_mapping = _flat_literals(station_map)

# Create mapping expressions
payment_method_expr = create_map(_flat_literals(payment_method_mapping))
transit_mode_expr = create_map(_flat_literals(transit_mode_mapping))
borough_expr = create_map(_flat_literals(borough_mapping))
station_complex_expr = create_map(*station_mapping)

# Apply the mappings and round the prediction to a whole number in a single
# projection; an index with no entry in its map yields null for that label.
predictions_df = output_predictions.select(
    transit_mode_expr[col("transit_mode_index").cast("int")].alias("transit_mode"),
    station_complex_expr[col("station_complex_index").cast("int")].alias("station_complex"),
    borough_expr[col("borough_index").cast("int")].alias("borough"),
    payment_method_expr[col("payment_method_index").cast("int")].alias("payment_method"),
    spark_round("prediction", 0).cast("integer").alias("prediction"),
)
predictions_df.show(5, truncate=False)

[Stage 9:>                                                          (0 + 1) / 1]

+------------+--------------------------------------------+--------+--------------+----------+
|transit_mode|station_complex                             |borough |payment_method|prediction|
+------------+--------------------------------------------+--------+--------------+----------+
|subway      |Hunts Point Av (6)                          |Bronx   |metrocard     |2026      |
|subway      |25 Av (D)                                   |Brooklyn|metrocard     |1238      |
|subway      |Atlantic Av-Barclays Ctr (B,D,N,Q,R,2,3,4,5)|Brooklyn|metrocard     |8411      |
|subway      |Avenue P (F)                                |Brooklyn|omny          |771       |
|subway      |Church Av (F,G)                             |Brooklyn|omny          |2940      |
+------------+--------------------------------------------+--------+--------------+----------+
only showing top 5 rows



                                                                                

In [9]:
# Save the decoded predictions to the BigQuery output table in "overwrite"
# mode, using the configured temporary GCS bucket for the write.
(
    predictions_df.write
    .mode("overwrite")
    .format("bigquery")
    .option("temporaryGcsBucket", "temp_nyc_bucket_for_bq")
    .option("table", "lively-encoder-448916-d5.nyc_subway.daily_future_pred_output")
    .save()
)

                                                                                