# Daily ridership prediction

In [1]:
import os
# Export the service-account key file location through the
# GOOGLE_APPLICATION_CREDENTIALS environment variable for this kernel process.
# NOTE(review): this is an absolute, machine-specific path to a credential
# file; consider setting the variable outside the notebook instead.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/home/thehari08/Work/surya/surya_previous/lively-encoder-448916-d5-5e9819f4df4e.json"

from datetime import datetime

from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.sql import Window
from pyspark.sql.functions import col, count, countDistinct, sum, min, max, avg, stddev, hour, dayofweek, dayofmonth, ceil, sin, cos, lit, pi, lag, unix_timestamp, create_map
from pyspark.sql.types import (
    StructType, StructField, StringType, LongType, DoubleType, ArrayType, TimestampType, FloatType
)

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

import sklearn
import xgboost as xgb
from xgboost.spark import SparkXGBRegressor
from pyspark.ml.regression import RandomForestRegressor

### Initialize Spark session and load data from Google Cloud Storage

In [30]:
# Tear down a Spark session left over from an earlier run of this notebook,
# if one is bound to the name `spark`, before a new one is built.
if "spark" in globals():
    spark.stop()

In [3]:
# Initialize the Spark session with the GCS / BigQuery connectors.

# Reuse the key file location exported in the first cell instead of repeating
# the literal path here, so the credential location is defined only once.
service_account_keyfile = os.environ["GOOGLE_APPLICATION_CREDENTIALS"]

# Extra jars put on the classpath (comma-separated for `spark.jars`).
spark_jars = ",".join([
    "/home/thehari08/Work/surya/surya_previous/gcs-connector-hadoop3-latest.jar",
    "/home/thehari08/Work/surya/surya_previous/spark-bigquery-with-dependencies_2.12-0.28.0.jar",
    # NOTE(review): this artifact is a Scala 2.13 build while the BigQuery
    # connector above is a Scala 2.12 build; the training log reports the
    # Python package (xgboost-3.0.0) doing the work -- confirm whether this
    # jar is needed at all.
    "https://repo1.maven.org/maven2/ml/dmlc/xgboost4j-spark_2.13/2.1.4/xgboost4j-spark_2.13-2.1.4.jar",
])

spark = (
    SparkSession.builder
    .appName("NYC Subway Data Processing")
    .config("spark.jars", spark_jars)
    # Google Cloud Storage filesystem bindings and service-account auth.
    .config("spark.hadoop.fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
    .config("spark.hadoop.fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
    .config("spark.hadoop.google.cloud.auth.service.account.enable", "true")
    .config("spark.hadoop.google.cloud.auth.service.account.json.keyfile", service_account_keyfile)
    # Single-machine run with 24 local worker threads.
    .master("local[24]")
    .config("spark.driver.memory", "48g")
    # NOTE(review): with a local master the executor settings below are
    # presumably not what sizes the JVM -- confirm before tuning them.
    .config("spark.executor.memory", "48g")
    .config("spark.driver.maxResultSize", "4g")
    .config("spark.executor.cores", "6")
    .config("spark.task.cpus", "1")
    .config("spark.default.parallelism", "56")
    .config("spark.sql.shuffle.partitions", "56")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.rdd.compress", "true")
    .config("spark.locality.wait", "0")
    # Parquet reading: take stored dates/timestamps as-is, non-vectorized reader.
    .config("spark.sql.parquet.int96RebaseModeInRead", "CORRECTED")
    .config("spark.sql.parquet.datetimeRebaseModeInRead", "CORRECTED")
    .config("spark.sql.parquet.enableVectorizedReader", "false")
    # Arrow stays disabled for PySpark. The `pyspark.`-prefixed keys replace
    # `spark.sql.execution.arrow.enabled` / `...arrow.fallback.enabled`,
    # which are deprecated since Spark 3.0.
    .config("spark.sql.execution.arrow.pyspark.enabled", "false")
    .config("spark.sql.execution.arrow.pyspark.fallback.enabled", "false")
    .config("spark.local.dir", "/home/thehari08/Work/spark")
    .getOrCreate()
)

# Suppress warnings, show only errors
spark.sparkContext.setLogLevel("ERROR")

25/04/18 07:23:26 WARN Utils: Your hostname, theBEAST resolves to a loopback address: 127.0.1.1; using 10.20.30.10 instead (on interface enp4s0)
25/04/18 07:23:26 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
25/04/18 07:23:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/18 07:23:28 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


In [4]:
# Load the dataset from the Parquet files stored in the GCS bucket.
gcs_bucket_path = "gs://date_pred_data/*.parquet"
df = spark.read.parquet(gcs_bucket_path)

# The frame is very wide, so print the sample row vertically (one
# "column | value" line per field) instead of as one unreadable table row.
df.show(1, truncate=False, vertical=True)
df.count()

                                                                                

+------------+------------------+---------------------+-------------+--------------------+---------+--------+-----------+-----------------------+---------------+-------------+------------------+-------------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+---------------+-----------------+-------------------+---------------------+---------------------+-----------------------+-----------------------+----------------+------------------+--------------------+----------------------+----------------------+------------------------+-----------

                                                                                

1355175

## Modeling

In [5]:
# Chronological hold-out to preserve order: rows dated before 2024-01-01
# (2020-2023) form the training set, rows from 2024-01-01 onward the test set.
split_date = "2024-01-01"
train_data = df.where(col("transit_date") < split_date)
test_data = df.where(col("transit_date") >= split_date)

In [6]:
# Feature engineering: every column of the frame is a model input except the
# three listed below ("ridership" is the label used by the models).
non_feature_columns = {"transit_date", "ridership", "transfer"}
feature_columns = [name for name in df.columns if name not in non_feature_columns]
print("Feature columns:", feature_columns)

Feature columns: ['transit_mode_index', 'station_complex_index', 'borough_index', 'payment_method_index', 'day_of_week', 'day_of_week_sin', 'day_of_week_cos', 'week_of_month', 'week_of_month_sin', 'week_of_month_cos', 'ridership_lag_1', 'ridership_lag_2', 'ridership_lag_3', 'ridership_lag_4', 'ridership_lag_5', 'ridership_lag_6', 'ridership_lag_7', 'ridership_lag_8', 'ridership_lag_9', 'ridership_lag_10', 'ridership_lag_11', 'ridership_lag_12', 'ridership_lag_13', 'ridership_lag_14', 'ridership_lag_15', 'ridership_lag_16', 'ridership_lag_17', 'ridership_lag_18', 'ridership_lag_19', 'ridership_lag_20', 'ridership_lag_21', 'ridership_lag_22', 'ridership_lag_23', 'ridership_lag_24', 'ridership_lag_25', 'ridership_lag_26', 'ridership_lag_27', 'ridership_lag_28', 'ridership_lag_29', 'ridership_lag_30', 'ridership_7d_mv', 'day_of_week_7d_mv', 'week_of_month_7d_mv', 'day_of_week_sin_7d_mv', 'day_of_week_cos_7d_mv', 'week_of_month_sin_7d_mv', 'week_of_month_cos_7d_mv', 'ridership_30d_mv', 'day

### XGBoost model

In [7]:
# XGBoost regressor
# Stage 1: pack the feature columns into a single vector column.
assembler = VectorAssembler(outputCol="features", inputCols=feature_columns)

# Stage 2: the XGBoost estimator, configured from one parameter dict.
xgb_params = dict(
    features_col="features",
    label_col="ridership",
    num_workers=8,        # Adjusted based on cluster configuration
    max_depth=10,
    learning_rate=0.1,
)
xgb_regressor = SparkXGBRegressor(**xgb_params)

# Chain assembly and training, fit once on the training split (no
# cross-validation), then score the held-out test split.
xgb_pipeline = Pipeline(stages=[assembler, xgb_regressor])
xgb_model = xgb_pipeline.fit(train_data)
xgb_predictions = xgb_model.transform(test_data)

2025-04-18 07:24:18,786 INFO XGBoost-PySpark: _fit Running xgboost-3.0.0 on 8 workers with
	booster params: {'objective': 'reg:squarederror', 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 10, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}


2025-04-18 07:24:43,522 INFO XGBoost-PySpark: _train_booster Training on CPUs 8]
[07:24:44] Task 0 got rank 0
[07:24:44] Task 2 got rank 2
[07:24:44] Task 4 got rank 4
[07:24:44] Task 3 got rank 3
[07:24:44] Task 5 got rank 5
[07:24:44] Task 6 got rank 6
[07:24:44] Task 7 got rank 7
[07:24:44] Task 1 got rank 1
[07:24:45] [0]	training-rmse:4333.35137
[07:24:45] [1]	training-rmse:3913.47675
[07:24:45] [2]	training-rmse:3536.30481
[07:24:46] [3]	training-rmse:3197.60639
[07:24:46] [4]	training-rmse:2893.57917
[07:24:46] [5]	training-rmse:2620.90182
[07:24:46] [6]	training-rmse:2376.10225
[07:24:47] [7]	training-rmse:2156.85164
[07:24:47] [8]	training-rmse:1960.96658
[07:24:47] [9]	training-rmse:1785.62963
[07:24:47] [10]	training-rmse:1629.14021
[07:24:47] [11]	training-rmse:1489.84261
[07:24:48] [12]	training-rmse:1365.31872
[07:24:48] [13]	training-rmse:1254.68255
[07:24:48] [14]	training-rmse:1156.94562
[07:24:48] [15]	training-rmse:1069.95641
[07:24:48] [16]	training-rmse:993.51860
[

In [8]:
# XGB metrics: metric key understood by RegressionEvaluator -> display name.
xgb_metrics = dict(
    rmse='Root Mean Squared Error',
    mse='Mean Squared Error',
    mae='Mean Absolute Error',
)

# One evaluator, re-pointed at each metric in turn.
xgb_evaluator = RegressionEvaluator(labelCol="ridership", predictionCol="prediction")
for metric, name in xgb_metrics.items():
    value = xgb_evaluator.setMetricName(metric).evaluate(xgb_predictions)
    print(f"{name} ({metric.upper()}): {value}")

# Values recorded from an earlier run:
# Root Mean Squared Error (RMSE): 1271.62
# Mean Squared Error (MSE): 1617037.06
# Mean Absolute Error (MAE): 366.85

2025-04-18 07:25:18,870 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
                                                                                

Root Mean Squared Error (RMSE): 1271.6277225599736


2025-04-18 07:25:42,044 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
                                                                                

Mean Squared Error (MSE): 1617037.064783065


2025-04-18 07:26:03,819 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs

Mean Absolute Error (MAE): 366.85428671978406


                                                                                

### Save model in cloud

In [None]:
# Persist the fitted XGBoost pipeline model to GCS, overwriting whatever is
# already stored at the target path.
gcs_model_path = "gs://model_stored_for_pred_forecast/daily_prediction"
xgb_model_writer = xgb_model.write().overwrite()
xgb_model_writer.save(gcs_model_path)

### Random Forest model

In [12]:
# Random Forest regressor
# Stage 1: pack the feature columns into a single vector column.
assembler = VectorAssembler(outputCol="features", inputCols=feature_columns)

# Stage 2: the Random Forest estimator, configured from one parameter dict.
rf_params = dict(
    featuresCol="features",
    labelCol="ridership",
    numTrees=100,                  # Number of trees in the forest
    maxDepth=10,                   # Maximum depth of each tree
    featureSubsetStrategy="auto",  # Number of features to consider for splits
    subsamplingRate=1.0,           # Fraction of training data used for each tree
    seed=42,
)
rf_regressor = RandomForestRegressor(**rf_params)

# Chain assembly and training, fit on the training split, then score the
# held-out test split.
rf_pipeline = Pipeline(stages=[assembler, rf_regressor])
rf_model = rf_pipeline.fit(train_data)
rf_predictions = rf_model.transform(test_data)



                                                                                

In [13]:
# RF metrics: metric key understood by RegressionEvaluator -> display name.
rf_metrics = dict(
    rmse='Root Mean Squared Error',
    mse='Mean Squared Error',
    mae='Mean Absolute Error',
)

# One evaluator, re-pointed at each metric in turn.
rf_evaluator = RegressionEvaluator(labelCol="ridership", predictionCol="prediction")
for metric, name in rf_metrics.items():
    value = rf_evaluator.setMetricName(metric).evaluate(rf_predictions)
    print(f"{name} ({metric.upper()}): {value}")

# Values recorded from an earlier run:
# Root Mean Squared Error (RMSE): 2269.51
# Mean Squared Error (MSE): 5150680.74
# Mean Absolute Error (MAE): 551.23

                                                                                

Root Mean Squared Error (RMSE): 2269.5111255898855


[Stage 44:>                                                       (0 + 24) / 38]

                                                                                

Mean Squared Error (MSE): 5150680.749176269




Mean Absolute Error (MAE): 551.2335129139828


                                                                                

### Storing best performing model's predictions to BigQuery

In [14]:
# Replace transit_date with the same column cast to the "date" type.
transit_date_as_date = col("transit_date").cast("date")
xgb_predictions = xgb_predictions.withColumn("transit_date", transit_date_as_date)
xgb_predictions.show(n=5, truncate=False)

[Stage 48:>                                                         (0 + 1) / 1]

+------------+------------------+---------------------+-------------+--------------------+---------+--------+-----------+-----------------------+-------------------+-------------+-----------------------+-------------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+------------------+------------------+-------------------+-----------------------+-----------------------+-----------------------+-----------------------+------------------+------------------+--------------------+----------------------+----------------------+-----------------

2025-04-18 07:30:06,111 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
                                                                                

In [15]:
# Keep only the key columns, the actuals and the model prediction.
prediction_columns = [
    "transit_date",
    "transit_mode_index",
    "borough_index",
    "station_complex_index",
    "payment_method_index",
    "ridership",
    "transfer",
    "prediction",
]
xgb_date_pred = xgb_predictions.select(*prediction_columns)
xgb_date_pred.show(n=5, truncate=False)

[Stage 49:>                                                         (0 + 1) / 1]

+------------+------------------+-------------+---------------------+--------------------+---------+--------+-----------------+
|transit_date|transit_mode_index|borough_index|station_complex_index|payment_method_index|ridership|transfer|prediction       |
+------------+------------------+-------------+---------------------+--------------------+---------+--------+-----------------+
|2024-06-08  |2                 |2            |346                  |2                   |287      |7       |267.2445983886719|
|2024-05-29  |2                 |2            |346                  |2                   |444      |13      |440.7408142089844|
|2024-01-16  |2                 |3            |109                  |2                   |2628     |33      |2289.254638671875|
|2024-10-20  |2                 |4            |146                  |2                   |415      |1       |521.6731567382812|
|2024-10-27  |2                 |2            |286                  |2                   |941      |14  

2025-04-18 07:30:19,714 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
                                                                                

In [16]:
# Map encoded categorical columns to their original values
payment_method_mapping = {1: 'metrocard', 2: 'omny'}
transit_mode_mapping = {1: 'staten_island_railway', 2: 'subway', 3: 'tram'}
borough_mapping = {1: 'Bronx', 2: 'Brooklyn', 3: 'Manhattan', 4: 'Queens', 5: 'Staten Island'}


def _as_map_literals(mapping):
    """Flatten {key: label} into [lit(key), lit(label), ...] for create_map."""
    flat = []
    for key, label in mapping.items():
        flat.extend((lit(key), lit(label)))
    return flat


# Station lookup comes from a CSV file: integer index -> station complex name.
station_df = (
    spark.read.csv("station_complex.csv", header=True)
    .withColumn("station_complex_index", col("station_complex_index").cast("int"))
    .select("station_complex_index", "station_complex")
)

# Collect the two-column frame to the driver as a plain dict.
station_map = station_df.rdd.collectAsMap()
station_mapping = _as_map_literals(station_map)

# Create mapping expressions
payment_method_expr = create_map(_as_map_literals(payment_method_mapping))
transit_mode_expr = create_map(_as_map_literals(transit_mode_mapping))
borough_expr = create_map(_as_map_literals(borough_mapping))
station_complex_expr = create_map(*station_mapping)

# Apply the mappings: add one label column per encoded index column, in order.
label_lookups = [
    ("payment_method", "payment_method_index", payment_method_expr),
    ("transit_mode", "transit_mode_index", transit_mode_expr),
    ("borough", "borough_index", borough_expr),
    ("station_complex", "station_complex_index", station_complex_expr),
]
xgb_date_pred_mapped = xgb_date_pred
for label_column, index_column, lookup_expr in label_lookups:
    xgb_date_pred_mapped = xgb_date_pred_mapped.withColumn(
        label_column,
        lookup_expr[col(index_column).cast("int")],
    )

xgb_date_pred_mapped.show(5, truncate=False)

[Stage 52:>                                                         (0 + 1) / 1]

+------------+------------------+-------------+---------------------+--------------------+---------+--------+-----------------+--------------+------------+---------+---------------+
|transit_date|transit_mode_index|borough_index|station_complex_index|payment_method_index|ridership|transfer|prediction       |payment_method|transit_mode|borough  |station_complex|
+------------+------------------+-------------+---------------------+--------------------+---------+--------+-----------------+--------------+------------+---------+---------------+
|2024-06-08  |2                 |2            |346                  |2                   |287      |7       |267.2445983886719|omny          |subway      |Brooklyn |Neptune Av (F) |
|2024-05-29  |2                 |2            |346                  |2                   |444      |13      |440.7408142089844|omny          |subway      |Brooklyn |Neptune Av (F) |
|2024-01-16  |2                 |3            |109                  |2                   |

2025-04-18 07:30:33,353 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
                                                                                

In [17]:
# Keep the decoded label columns, the actuals and the prediction; the encoded
# *_index columns are left out of the frame that gets exported.
bq_columns = [
    'transit_date',
    'transit_mode',
    'borough',
    'station_complex',
    'payment_method',
    'ridership',
    'transfer',
    'prediction',
]
xgb_date_pred_bq = xgb_date_pred_mapped.select(*bq_columns)
xgb_date_pred_bq.show(5)

[Stage 53:>                                                         (0 + 1) / 1]

+------------+------------+---------+---------------+--------------+---------+--------+-----------------+
|transit_date|transit_mode|  borough|station_complex|payment_method|ridership|transfer|       prediction|
+------------+------------+---------+---------------+--------------+---------+--------+-----------------+
|  2024-06-08|      subway| Brooklyn| Neptune Av (F)|          omny|      287|       7|267.2445983886719|
|  2024-05-29|      subway| Brooklyn| Neptune Av (F)|          omny|      444|      13|440.7408142089844|
|  2024-01-16|      subway|Manhattan|      57 St (F)|          omny|     2628|      33|2289.254638671875|
|  2024-10-20|      subway|   Queens|      88 St (A)|          omny|      415|       1|521.6731567382812|
|  2024-10-27|      subway| Brooklyn|   Grant Av (A)|          omny|      941|      14| 978.265869140625|
+------------+------------+---------+---------------+--------------+---------+--------+-----------------+
only showing top 5 rows



2025-04-18 07:30:46,771 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
                                                                                

In [None]:
# Write predictions to BigQuery, replacing the contents of the target table.
bq_writer = (
    xgb_date_pred_bq.write
    .format("bigquery")
    .option("table", "lively-encoder-448916-d5.nyc_subway.daily_predictions")
    .option("temporaryGcsBucket", "temp_nyc_bucket_for_bq")
    .mode("overwrite")
)
bq_writer.save()

2025-04-18 07:31:05,476 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs


                                                                                