In [61]:
from pyspark.ml import PipelineModel
from pyspark.sql.functions import col, lit, to_date
from pyspark.sql.types import IntegerType

# Load the previously fitted Spark ML pipeline from Cloud Storage.
# `model` is used later in the forecasting loop via model.transform(...).
model_path = "gs://model_stored_for_pred_forecast/daily_prediction/"
model = PipelineModel.load(model_path)

In [62]:
from google.cloud import bigquery

# Initialize the BigQuery client; no project or credentials are passed explicitly here.
# Every SQL string defined below is executed through this client.
client = bigquery.Client()

In [63]:
# Step 1: Seed the working table `recent_dates_data` with the most recent 30 dates
# of `date_model`: MAX(transit_date) - 29 through MAX(transit_date), inclusive.
# `ridership` is aliased to `prediction` so that the forecast rows appended later by
# query_union share one column with the observed rows.
# NOTE(review): ORDER BY inside CREATE TABLE ... AS SELECT presumably does not fix the
# stored row order - confirm whether it is needed.
# This cell only defines the SQL text; it runs when passed to client.query(...).
query_recent_data = """
                    CREATE OR REPLACE TABLE `lively-encoder-448916-d5.nyc_subway.recent_dates_data`
                    AS
                    SELECT
                    transit_date, transit_mode_index,
                    station_complex_index, borough_index,
                    payment_method_index, ridership AS prediction,
                    day_of_week, day_of_week_sin, day_of_week_cos,
                    week_of_month, week_of_month_sin, week_of_month_cos,
                    ridership_lag_1, ridership_lag_2, ridership_lag_3, ridership_lag_4, ridership_lag_5,
                    ridership_lag_6, ridership_lag_7, ridership_lag_8, ridership_lag_9, ridership_lag_10,
                    ridership_lag_11, ridership_lag_12, ridership_lag_13, ridership_lag_14, ridership_lag_15,
                    ridership_lag_16, ridership_lag_17, ridership_lag_18, ridership_lag_19, ridership_lag_20,
                    ridership_lag_21, ridership_lag_22, ridership_lag_23, ridership_lag_24, ridership_lag_25,
                    ridership_lag_26, ridership_lag_27, ridership_lag_28, ridership_lag_29, ridership_lag_30,
                    ridership_7d_mv,
                    day_of_week_7d_mv, week_of_month_7d_mv,
                    day_of_week_sin_7d_mv, day_of_week_cos_7d_mv,
                    week_of_month_sin_7d_mv, week_of_month_cos_7d_mv,
                    ridership_30d_mv,
                    day_of_week_30d_mv, week_of_month_30d_mv,
                    day_of_week_sin_30d_mv, day_of_week_cos_30d_mv,
                    week_of_month_sin_30d_mv, week_of_month_cos_30d_mv
                    FROM `lively-encoder-448916-d5.nyc_subway.date_model`
                    WHERE transit_date BETWEEN
                        (SELECT MAX(transit_date) - 29 FROM `lively-encoder-448916-d5.nyc_subway.date_model`)
                        AND
                        (SELECT MAX(transit_date) FROM `lively-encoder-448916-d5.nyc_subway.date_model`)
                    ORDER BY transit_date
                    """

In [64]:
# Step 2: Build the lag features for the next date to forecast.
# Takes the rows of the latest transit_date in `recent_dates_data`, stamps them with
# MAX(transit_date) + 1, and shifts every lag by one position: that day's `prediction`
# becomes ridership_lag_1, ridership_lag_1 becomes ridership_lag_2, ..., and
# ridership_lag_29 becomes ridership_lag_30 (the previous ridership_lag_30 is dropped).
# NOTE(review): LIMIT 5000 has no ORDER BY, so if the latest date ever holds more than
# 5000 rows the retained subset is arbitrary - confirm the cap is intentional.
query_lags = """
            CREATE OR REPLACE TABLE `lively-encoder-448916-d5.nyc_subway.daily_forecast_lags`
            AS
            SELECT
                (SELECT MAX(transit_date)+1 FROM `lively-encoder-448916-d5.nyc_subway.recent_dates_data`) AS transit_date,
                transit_mode_index, station_complex_index, borough_index, payment_method_index,
                prediction as ridership_lag_1,
                ridership_lag_1 as ridership_lag_2,
                ridership_lag_2 as ridership_lag_3,
                ridership_lag_3 as ridership_lag_4,
                ridership_lag_4 as ridership_lag_5,
                ridership_lag_5 as ridership_lag_6,
                ridership_lag_6 as ridership_lag_7,
                ridership_lag_7 as ridership_lag_8,
                ridership_lag_8 as ridership_lag_9,
                ridership_lag_9 as ridership_lag_10,
                ridership_lag_10 as ridership_lag_11,
                ridership_lag_11 as ridership_lag_12,
                ridership_lag_12 as ridership_lag_13,
                ridership_lag_13 as ridership_lag_14,
                ridership_lag_14 as ridership_lag_15,
                ridership_lag_15 as ridership_lag_16,
                ridership_lag_16 as ridership_lag_17,
                ridership_lag_17 as ridership_lag_18,
                ridership_lag_18 as ridership_lag_19,
                ridership_lag_19 as ridership_lag_20,
                ridership_lag_20 as ridership_lag_21,
                ridership_lag_21 as ridership_lag_22,
                ridership_lag_22 as ridership_lag_23,
                ridership_lag_23 as ridership_lag_24,
                ridership_lag_24 as ridership_lag_25,
                ridership_lag_25 as ridership_lag_26,
                ridership_lag_26 as ridership_lag_27,
                ridership_lag_27 as ridership_lag_28,
                ridership_lag_28 as ridership_lag_29,
                ridership_lag_29 as ridership_lag_30
            FROM `lively-encoder-448916-d5.nyc_subway.recent_dates_data`
            WHERE transit_date = (SELECT MAX(transit_date) FROM `lively-encoder-448916-d5.nyc_subway.recent_dates_data`)
            LIMIT 5000
            """

In [65]:
# Step 3: Compute trailing moving averages of `prediction` and of the calendar features.
# Every window is partitioned by the four index columns and ordered by transit_date.
# The frames are row-based: the current row plus the 6 (or 29) rows before it
# (`0 PRECEDING` ends the frame at the current row), so they count rows, not calendar days.
# The CTE reads the last 30 dates of `recent_dates_data`; the outer SELECT keeps only the
# row for the latest transit_date and adds `next_transit_date` = that date + 1.
# NOTE(review): after the WHERE filter all rows share one transit_date, so
# ORDER BY transit_date DESC does not make LIMIT 5000 deterministic - confirm the cap.
query_ma = """
            CREATE OR REPLACE TABLE `lively-encoder-448916-d5.nyc_subway.daily_forecast_moving_avg`
            AS (
            WITH ridership_moving_avg AS (
                SELECT
                    transit_date,
                    transit_mode_index,
                    station_complex_index,
                    borough_index,
                    payment_method_index,
                    -- 7-day moving average
                    AVG(prediction) OVER (
                        PARTITION BY station_complex_index, transit_mode_index, borough_index, payment_method_index
                        ORDER BY transit_date
                        ROWS BETWEEN 6 PRECEDING AND 0 PRECEDING
                    ) AS ridership_7d_mv,
                    AVG(day_of_week) OVER (
                        PARTITION BY station_complex_index, transit_mode_index, borough_index, payment_method_index
                        ORDER BY transit_date
                        ROWS BETWEEN 6 PRECEDING AND 0 PRECEDING
                    ) AS day_of_week_7d_mv,
                    AVG(week_of_month) OVER (
                        PARTITION BY station_complex_index, transit_mode_index, borough_index, payment_method_index
                        ORDER BY transit_date
                        ROWS BETWEEN 6 PRECEDING AND 0 PRECEDING
                    ) AS week_of_month_7d_mv,
                    AVG(day_of_week_sin) OVER (
                        PARTITION BY station_complex_index, transit_mode_index, borough_index, payment_method_index
                        ORDER BY transit_date
                        ROWS BETWEEN 6 PRECEDING AND 0 PRECEDING
                    ) AS day_of_week_sin_7d_mv,
                    AVG(day_of_week_cos) OVER (
                        PARTITION BY station_complex_index, transit_mode_index, borough_index, payment_method_index
                        ORDER BY transit_date
                        ROWS BETWEEN 6 PRECEDING AND 0 PRECEDING
                    ) AS day_of_week_cos_7d_mv,
                    AVG(week_of_month_sin) OVER (
                        PARTITION BY station_complex_index, transit_mode_index, borough_index, payment_method_index
                        ORDER BY transit_date
                        ROWS BETWEEN 6 PRECEDING AND 0 PRECEDING
                    ) AS week_of_month_sin_7d_mv,
                    AVG(week_of_month_cos) OVER (
                        PARTITION BY station_complex_index, transit_mode_index, borough_index, payment_method_index
                        ORDER BY transit_date
                        ROWS BETWEEN 6 PRECEDING AND 0 PRECEDING
                    ) AS week_of_month_cos_7d_mv,
                    -- 30-day moving average
                    AVG(prediction) OVER (
                        PARTITION BY station_complex_index, transit_mode_index, borough_index, payment_method_index
                        ORDER BY transit_date
                        ROWS BETWEEN 29 PRECEDING AND 0 PRECEDING
                    ) AS ridership_30d_mv,
                    AVG(day_of_week) OVER (
                        PARTITION BY station_complex_index, transit_mode_index, borough_index, payment_method_index
                        ORDER BY transit_date
                        ROWS BETWEEN 29 PRECEDING AND 0 PRECEDING
                    ) AS day_of_week_30d_mv,
                    AVG(week_of_month) OVER (
                        PARTITION BY station_complex_index, transit_mode_index, borough_index, payment_method_index
                        ORDER BY transit_date
                        ROWS BETWEEN 29 PRECEDING AND 0 PRECEDING
                    ) AS week_of_month_30d_mv,
                    AVG(day_of_week_sin) OVER (
                        PARTITION BY station_complex_index, transit_mode_index, borough_index, payment_method_index
                        ORDER BY transit_date
                        ROWS BETWEEN 29 PRECEDING AND 0 PRECEDING
                    ) AS day_of_week_sin_30d_mv,
                    AVG(day_of_week_cos) OVER (
                        PARTITION BY station_complex_index, transit_mode_index, borough_index, payment_method_index
                        ORDER BY transit_date
                        ROWS BETWEEN 29 PRECEDING AND 0 PRECEDING
                    ) AS day_of_week_cos_30d_mv,
                    AVG(week_of_month_sin) OVER (
                        PARTITION BY station_complex_index, transit_mode_index, borough_index, payment_method_index
                        ORDER BY transit_date
                        ROWS BETWEEN 29 PRECEDING AND 0 PRECEDING
                    ) AS week_of_month_sin_30d_mv,
                    AVG(week_of_month_cos) OVER (
                        PARTITION BY station_complex_index, transit_mode_index, borough_index, payment_method_index
                        ORDER BY transit_date
                        ROWS BETWEEN 29 PRECEDING AND 0 PRECEDING
                    ) AS week_of_month_cos_30d_mv
                FROM `lively-encoder-448916-d5.nyc_subway.recent_dates_data`
                WHERE transit_date BETWEEN
                    (SELECT MAX(transit_date) - 29 FROM `lively-encoder-448916-d5.nyc_subway.recent_dates_data`)
                    AND
                    (SELECT MAX(transit_date) FROM `lively-encoder-448916-d5.nyc_subway.recent_dates_data`)
            )
            SELECT
                transit_date + 1 AS next_transit_date,
                *
            FROM
                ridership_moving_avg
            WHERE transit_date = (SELECT MAX(transit_date) FROM `lively-encoder-448916-d5.nyc_subway.recent_dates_data`)
            ORDER BY transit_date DESC
            LIMIT 5000
            );
            """

In [66]:
# Step 4: Combine lags and moving averages to create the input table for prediction.
# Lags come from `daily_forecast_lags` (l) and moving averages from
# `daily_forecast_moving_avg` (mv), inner-joined on the four index columns only;
# mv.next_transit_date is not part of the join (each table holds a single date as built
# by query_lags / query_ma). Keys missing from either table are dropped by the join.
# Calendar features are recomputed from the forecast date l.transit_date:
#   day_of_week   = EXTRACT(DAYOFWEEK ...), with sin/cos encodings of period 7
#   week_of_month = CEIL(day-of-month / 7), with sin/cos encodings of period 5
# ACOS(-1) supplies pi.
query_input = """
            CREATE OR REPLACE TABLE `lively-encoder-448916-d5.nyc_subway.daily_forecast_input`
            AS (
                SELECT
                    l.transit_date,
                    l.transit_mode_index,
                    l.station_complex_index,
                    l.borough_index,
                    l.payment_method_index,
                    EXTRACT(DAYOFWEEK FROM l.transit_date) AS day_of_week,
                    SIN(2 * ACOS(-1) * EXTRACT(DAYOFWEEK FROM l.transit_date) / 7) AS day_of_week_sin,
                    COS(2 * ACOS(-1) * EXTRACT(DAYOFWEEK FROM l.transit_date) / 7) AS day_of_week_cos,
                    CEIL(EXTRACT(DAY FROM l.transit_date) / 7) AS week_of_month,
                    SIN(2 * ACOS(-1) * CEIL(EXTRACT(DAY FROM l.transit_date) / 7) / 5) AS week_of_month_sin,
                    COS(2 * ACOS(-1) * CEIL(EXTRACT(DAY FROM l.transit_date) / 7) / 5) AS week_of_month_cos,
                    l.ridership_lag_1, l.ridership_lag_2, l.ridership_lag_3, l.ridership_lag_4, l.ridership_lag_5,
                    l.ridership_lag_6, l.ridership_lag_7, l.ridership_lag_8, l.ridership_lag_9, l.ridership_lag_10,
                    l.ridership_lag_11, l.ridership_lag_12, l.ridership_lag_13, l.ridership_lag_14, l.ridership_lag_15,
                    l.ridership_lag_16, l.ridership_lag_17, l.ridership_lag_18, l.ridership_lag_19, l.ridership_lag_20,
                    l.ridership_lag_21, l.ridership_lag_22, l.ridership_lag_23, l.ridership_lag_24, l.ridership_lag_25,
                    l.ridership_lag_26, l.ridership_lag_27, l.ridership_lag_28, l.ridership_lag_29, l.ridership_lag_30,
                    mv.ridership_7d_mv,
                    mv.day_of_week_7d_mv, mv.week_of_month_7d_mv,
                    mv.day_of_week_sin_7d_mv, mv.day_of_week_cos_7d_mv,
                    mv.week_of_month_sin_7d_mv, mv.week_of_month_cos_7d_mv,
                    mv.ridership_30d_mv,
                    mv.day_of_week_30d_mv, mv.week_of_month_30d_mv,
                    mv.day_of_week_sin_30d_mv, mv.day_of_week_cos_30d_mv,
                    mv.week_of_month_sin_30d_mv, mv.week_of_month_cos_30d_mv
                FROM `lively-encoder-448916-d5.nyc_subway.daily_forecast_lags` l
                INNER JOIN `lively-encoder-448916-d5.nyc_subway.daily_forecast_moving_avg` mv
                ON l.transit_mode_index = mv.transit_mode_index
                AND l.station_complex_index = mv.station_complex_index
                AND l.borough_index = mv.borough_index
                AND l.payment_method_index = mv.payment_method_index
            );
        """

In [67]:
# Step 6: Combine the model's predictions for the forecast date with the features used
# to produce them. (Step 5, the prediction itself, runs in the forecasting loop, which
# writes `daily_forecast_output`.)
# `forecast_buffer` takes `prediction` from daily_forecast_output (o) and every other
# column from daily_forecast_input (i), inner-joined on transit_date plus the four index
# columns. The SELECT lists columns in the same order as `recent_dates_data`, which
# query_union relies on when it appends these rows with SELECT * ... UNION ALL.
query_buffer = """
            CREATE OR REPLACE TABLE `lively-encoder-448916-d5.nyc_subway.forecast_buffer` AS
            SELECT
            -- Columns from daily_forecast_input
            i.transit_date,
            i.transit_mode_index,
            i.station_complex_index,
            i.borough_index,
            i.payment_method_index,
            -- Column from daily_forecast_output
            o.prediction,
            -- Other columns from daily_forecast_input
            i.day_of_week,
            i.day_of_week_sin,
            i.day_of_week_cos,
            i.week_of_month,
            i.week_of_month_sin,
            i.week_of_month_cos,
            i.ridership_lag_1,
            i.ridership_lag_2,
            i.ridership_lag_3,
            i.ridership_lag_4,
            i.ridership_lag_5,
            i.ridership_lag_6,
            i.ridership_lag_7,
            i.ridership_lag_8,
            i.ridership_lag_9,
            i.ridership_lag_10,
            i.ridership_lag_11,
            i.ridership_lag_12,
            i.ridership_lag_13,
            i.ridership_lag_14,
            i.ridership_lag_15,
            i.ridership_lag_16,
            i.ridership_lag_17,
            i.ridership_lag_18,
            i.ridership_lag_19,
            i.ridership_lag_20,
            i.ridership_lag_21,
            i.ridership_lag_22,
            i.ridership_lag_23,
            i.ridership_lag_24,
            i.ridership_lag_25,
            i.ridership_lag_26,
            i.ridership_lag_27,
            i.ridership_lag_28,
            i.ridership_lag_29,
            i.ridership_lag_30,
            i.ridership_7d_mv,
            i.day_of_week_7d_mv,
            i.week_of_month_7d_mv,
            i.day_of_week_sin_7d_mv,
            i.day_of_week_cos_7d_mv,
            i.week_of_month_sin_7d_mv,
            i.week_of_month_cos_7d_mv,
            i.ridership_30d_mv,
            i.day_of_week_30d_mv,
            i.week_of_month_30d_mv,
            i.day_of_week_sin_30d_mv,
            i.day_of_week_cos_30d_mv,
            i.week_of_month_sin_30d_mv,
            i.week_of_month_cos_30d_mv
            
            FROM `lively-encoder-448916-d5.nyc_subway.daily_forecast_input` AS i
            INNER JOIN `lively-encoder-448916-d5.nyc_subway.daily_forecast_output` AS o
            ON i.transit_date = o.transit_date
            AND i.transit_mode_index = o.transit_mode_index
            AND i.station_complex_index = o.station_complex_index
            AND i.borough_index = o.borough_index
            AND i.payment_method_index = o.payment_method_index;
            """

In [68]:
# Step 7: Append (UNION ALL, not a join) the newly forecast day in `forecast_buffer`
# to `recent_dates_data`, replacing the table in place.
# Both sides use SELECT *, so columns are matched by position; this depends on
# forecast_buffer listing its columns in the same order as recent_dates_data.
# UNION ALL keeps duplicates, so running this twice for the same forecast day would
# store that day twice (see the optional de-duplication query below).
query_union = """
            CREATE OR REPLACE TABLE `lively-encoder-448916-d5.nyc_subway.recent_dates_data` AS
            SELECT * FROM `lively-encoder-448916-d5.nyc_subway.recent_dates_data`
            UNION ALL
            SELECT * FROM `lively-encoder-448916-d5.nyc_subway.forecast_buffer`;
            """

# Optional query to drop exact duplicate rows (kept for reference, not executed):
# -- CREATE OR REPLACE TABLE `lively-encoder-448916-d5.nyc_subway.recent_dates_data` AS
# -- SELECT DISTINCT *
# -- FROM `lively-encoder-448916-d5.nyc_subway.recent_dates_data`;

In [70]:
# Execute Step 1: (re)create the `recent_dates_data` table in BigQuery.
# .result() blocks until the job finishes. Re-running this cell resets the table to the
# observed rows from `date_model`, discarding any forecast rows appended afterwards.
client.query(query_recent_data).result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x7f099e4c8430>

In [71]:
# Step 5: Roll the forecast forward one day at a time.
# Each pass rebuilds the feature tables from `recent_dates_data`, scores them with the
# loaded pipeline, writes the predictions to `daily_forecast_output`, and appends the
# scored day back to `recent_dates_data` so the next pass treats it as the newest day.
FORECAST_HORIZON_DAYS = 14  # number of consecutive days to forecast

for forecast_step in range(1, FORECAST_HORIZON_DAYS + 1):
    # Add lags, moving averages, and combine them into the model input table
    client.query(query_lags).result()
    client.query(query_ma).result()
    client.query(query_input).result()

    # Read input data from the BigQuery table into a Spark DataFrame
    input_daily_df = (
        spark.read.format("bigquery")
        .option("table", "lively-encoder-448916-d5.nyc_subway.daily_forecast_input")
        .option("parentProject", "lively-encoder-448916-d5")
        .load()
    )

    # The input table holds a single forecast date, so the first row identifies it.
    # first() returns None for an empty DataFrame; fail with a clear message instead of
    # an opaque "'NoneType' object is not subscriptable".
    first_input_row = input_daily_df.select("transit_date").first()
    if first_input_row is None:
        raise RuntimeError(
            f"daily_forecast_input is empty at forecast step {forecast_step}; "
            "check that daily_forecast_lags and daily_forecast_moving_avg share keys."
        )
    df_transit_date = first_input_row["transit_date"]

    # Drop the `transit_date` column from the DataFrame before inputting to the model
    input_daily_df = input_daily_df.drop("transit_date")

    # Get predictions, re-attach the forecast date, and keep only the key columns.
    # Casting to IntegerType truncates the fractional part of the prediction.
    output_predictions = (
        model.transform(input_daily_df)
        .withColumn("transit_date", to_date(lit(df_transit_date)))
        .withColumn("prediction", col("prediction").cast(IntegerType()))
        .select(
            'transit_date', 'transit_mode_index', 'station_complex_index',
            'borough_index', 'payment_method_index', 'prediction',
        )
    )

    # Write predictions, replacing the previous day's output table
    (
        output_predictions.write.format("bigquery")
        .option("table", "lively-encoder-448916-d5.nyc_subway.daily_forecast_output")
        .option("temporaryGcsBucket", "temp_nyc_bucket_for_bq")
        .mode("overwrite")
        .save()
    )

    # Update recent_dates_data with the newly forecast day
    client.query(query_buffer).result()
    client.query(query_union).result()

                                                                                

In [72]:
# Extract the forecast rows into `date_forecast`: every row of `recent_dates_data` whose
# transit_date is later than the latest date in `date_model`, i.e. the rows appended by
# the forecasting loop rather than the observed ones.
# NOTE(review): ORDER BY inside CREATE TABLE ... AS SELECT presumably does not fix the
# stored row order - confirm whether it is needed.
query_forecast = """
                CREATE OR REPLACE TABLE `lively-encoder-448916-d5.nyc_subway.date_forecast` AS
                SELECT * FROM `lively-encoder-448916-d5.nyc_subway.recent_dates_data`
                WHERE transit_date > (SELECT MAX(transit_date) from `lively-encoder-448916-d5.nyc_subway.date_model`)
                ORDER BY transit_date DESC;
                """
# Run the query and block until the table has been written
client.query(query_forecast).result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x7f099da6ab60>

In [73]:
# Load the `date_forecast` BigQuery table (the forecast rows) as a Spark DataFrame
date_forecast = (
    spark.read.format("bigquery")
    .option("table", "lively-encoder-448916-d5.nyc_subway.date_forecast")
    .option("parentProject", "lively-encoder-448916-d5")
    .load()
)

In [77]:
from pyspark.sql.functions import col, lit, create_map, round, to_date

# Lookup tables translating the encoded index columns back to their labels
payment_method_mapping = {1: 'metrocard', 2: 'omny'}
transit_mode_mapping = {1: 'staten_island_railway', 2: 'subway', 3: 'tram'}
borough_mapping = {1: 'Bronx', 2: 'Brooklyn', 3: 'Manhattan', 4: 'Queens', 5: 'Staten Island'}

# Station complex labels are read from a CSV in Cloud Storage: index (cast to int) -> name
station_df = (
    spark.read.csv("gs://bucket_jars/station_complex_csv/station_complex.csv", header=True)
    .withColumn("station_complex_index", col("station_complex_index").cast("int"))
    .select("station_complex_index", "station_complex")
)

# Collect the two-column frame to the driver as {station_complex_index: station_complex}
station_map = station_df.rdd.collectAsMap()
# Flatten to [key1, value1, key2, value2, ...] literal columns, the layout create_map takes
station_mapping = [
    lit(part) for index_and_name in station_map.items() for part in index_and_name
]

# Build one Spark map expression per categorical column
payment_method_expr = create_map(
    *[lit(part) for entry in payment_method_mapping.items() for part in entry]
)
transit_mode_expr = create_map(
    *[lit(part) for entry in transit_mode_mapping.items() for part in entry]
)
borough_expr = create_map(
    *[lit(part) for entry in borough_mapping.items() for part in entry]
)
station_complex_expr = create_map(*station_mapping)

# Look up each index (cast to int) in its map to add the decoded label columns
dates_forecasted = (
    date_forecast
    .withColumn("payment_method", payment_method_expr[col("payment_method_index").cast("int")])
    .withColumn("transit_mode", transit_mode_expr[col("transit_mode_index").cast("int")])
    .withColumn("borough", borough_expr[col("borough_index").cast("int")])
    .withColumn("station_complex", station_complex_expr[col("station_complex_index").cast("int")])
)

In [78]:
# Keep only the forecast date, the decoded label columns and the predicted ridership.
# The decoded frame is named `dates_forecasted`; the earlier spelling `date_forecasted`
# is not defined anywhere in this notebook and raised NameError on a fresh kernel.
dates_forecasted = dates_forecasted.select(
    'transit_date', 'transit_mode', 'station_complex', 'borough', 'payment_method', 'prediction'
)

In [79]:
# Persist the decoded forecast, replacing the `date_forecast` table in BigQuery.
# NOTE(review): `dates_forecasted` is derived from a read of this same table; this
# presumably works because the data is staged in the temporary GCS bucket before the
# table is replaced - confirm.
(
    dates_forecasted.write.format("bigquery")
    .option("table", "lively-encoder-448916-d5.nyc_subway.date_forecast")
    .option("temporaryGcsBucket", "temp_nyc_bucket_for_bq")
    .mode("overwrite")
    .save()
)

                                                                                