In [None]:
!pip install kagglehub pyspark xgboost findspark kagglehub pyspark==3.5.0 xgboost findspark

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
import os
import time
import kagglehub
import pandas as pd
import numpy as np
from pyspark.sql import functions as F
from pyspark.sql.functions import col, to_timestamp, when
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    StructType, StructField, IntegerType,
    DoubleType, StringType, TimestampType
)
from pyspark.ml.feature import VectorAssembler, StandardScaler
from xgboost.spark import SparkXGBRegressor

In [None]:
# Download the Kaggle dataset (or reuse the cached copy); the call returns the
# local directory that holds the files.
local_path = kagglehub.dataset_download("elemento/nyc-yellow-taxi-trip-data")
print("Local dataset path:", local_path)
# Build the glob from the returned directory instead of hard-coding the cache
# location and version number, so the notebook keeps working for other users,
# other cache directories and newer dataset versions.
csv_glob = os.path.join(local_path, "*.csv")

Local dataset path: /root/.cache/kagglehub/datasets/elemento/nyc-yellow-taxi-trip-data/versions/2


In [None]:
# Explicit schema for the trip CSV files, as (column name, Spark type) pairs in
# file column order. Every column is declared nullable.
_TRIP_COLUMNS = [
    ("VendorID", IntegerType),
    ("tpep_pickup_datetime", TimestampType),
    ("tpep_dropoff_datetime", TimestampType),
    ("passenger_count", IntegerType),
    ("trip_distance", DoubleType),
    ("pickup_longitude", DoubleType),
    ("pickup_latitude", DoubleType),
    ("RateCodeID", IntegerType),
    ("store_and_fwd_flag", StringType),
    ("dropoff_longitude", DoubleType),
    ("dropoff_latitude", DoubleType),
    ("payment_type", IntegerType),
    ("fare_amount", DoubleType),
    ("extra", DoubleType),
    ("mta_tax", DoubleType),
    ("tip_amount", DoubleType),
    ("tolls_amount", DoubleType),
    ("improvement_surcharge", DoubleType),
    ("total_amount", DoubleType),
]

schema = StructType(
    [StructField(name, spark_type(), True) for name, spark_type in _TRIP_COLUMNS]
)

# Session settings, applied to the builder one key at a time.
# NOTE(review): "spark.cores" does not look like a documented Spark property
# and is probably ignored; confirm the intended key before relying on it.
_SESSION_SETTINGS = {
    "spark.driver.memory": "120g",
    "spark.task.cpus": "4",
    "spark.cores": "4",
    "spark.sql.shuffle.partitions": "200",
}

config = SparkSession.builder.appName("NYC Taxi Price Prediction")
for _key, _value in _SESSION_SETTINGS.items():
    config = config.config(_key, _value)

spark = config.getOrCreate()

# Read every CSV matched by the glob using the explicit schema; the files have
# a header row and use a double quote as the escape character.
_reader = spark.read.schema(schema).option("header", True).option("escape", '"')
df = _reader.csv(csv_glob)

In [None]:
def process_data(df):
    """Clean raw taxi trips and derive the time-based model features.

    Steps, in order:
      1. Drop rows with a null or zero ``trip_distance``, a null pickup or
         drop-off timestamp, or a non-positive ``fare_amount``.
      2. Trim ``trip_distance`` and ``fare_amount`` to their approximate
         1st-99th percentile range (bounds inclusive).
      3. Add ``pickup_hour``, ``day_of_week``, ``is_weekend`` (1/0) and
         ``trip_duration`` (minutes between pickup and drop-off).
      4. Drop rows that still hold a null in any modelling column.

    Args:
        df: Spark DataFrame containing at least ``trip_distance``,
            ``fare_amount``, ``passenger_count``, ``tpep_pickup_datetime``
            and ``tpep_dropoff_datetime``.

    Returns:
        A new, filtered DataFrame with the four derived columns appended.
    """
    # Remove invalid rows *before* estimating the outlier bounds, so that
    # zero/negative fares do not distort the percentiles used for trimming.
    df = df.filter(
        col("trip_distance").isNotNull()
        & (col("trip_distance") != 0)
        & col("tpep_pickup_datetime").isNotNull()
        & col("tpep_dropoff_datetime").isNotNull()
        & (col("fare_amount") > 0)
    )

    # One pass over the data for both columns instead of one job per column.
    trip_distance_quantiles, fare_amount_quantiles = df.approxQuantile(
        ["trip_distance", "fare_amount"], [0.01, 0.99], 0.01
    )
    # approxQuantile yields empty lists when no rows are left; skip trimming
    # in that case instead of failing with an IndexError.
    if trip_distance_quantiles and fare_amount_quantiles:
        df = df.filter(
            col("trip_distance").between(
                trip_distance_quantiles[0], trip_distance_quantiles[1]
            )
            & col("fare_amount").between(
                fare_amount_quantiles[0], fare_amount_quantiles[1]
            )
        )

    df = df.withColumn("tpep_pickup_datetime", to_timestamp(col("tpep_pickup_datetime")))
    df = df.withColumn("tpep_dropoff_datetime", to_timestamp(col("tpep_dropoff_datetime")))
    df = df.withColumn("pickup_hour", F.hour(col("tpep_pickup_datetime")))
    # Spark's dayofweek numbers Sunday as 1 and Saturday as 7.
    df = df.withColumn("day_of_week", F.dayofweek(col("tpep_pickup_datetime")))
    df = df.withColumn("is_weekend", when(col("day_of_week").isin(1, 7), 1).otherwise(0))

    # Duration in minutes.
    # NOTE(review): non-positive durations (drop-off at or before pickup) are
    # not filtered here; confirm whether they should be.
    df = df.withColumn(
        "trip_duration",
        (F.unix_timestamp("tpep_dropoff_datetime") - F.unix_timestamp("tpep_pickup_datetime")) / 60.0,
    )

    # passenger_count is a model feature too: a null left in it makes a
    # VectorAssembler with the default handleInvalid="error" fail.
    df = df.dropna(
        subset=[
            "pickup_hour",
            "passenger_count",
            "is_weekend",
            "trip_duration",
            "fare_amount",
            "trip_distance",
        ]
    )

    return df

# Swap the raw frame for its cleaned, feature-enriched version and preview it.
df = process_data(df)
df.show(n=5)


+--------+--------------------+---------------------+---------------+-------------+------------------+------------------+----------+------------------+------------------+------------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+-----------+-----------+----------+------------------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|  pickup_longitude|   pickup_latitude|RateCodeID|store_and_fwd_flag| dropoff_longitude|  dropoff_latitude|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|pickup_hour|day_of_week|is_weekend|     trip_duration|
+--------+--------------------+---------------------+---------------+-------------+------------------+------------------+----------+------------------+------------------+------------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+-----------+-----------+-----

In [None]:
# Work on a 10% sample. The seed pins which rows are drawn; without it the
# seeded train/test split further down would still operate on different data
# on every run.
df2 = df.sample(fraction=0.1, seed=42)

In [None]:
# Numeric columns that are packed into the single "features" vector column.
_feature_columns = [
    "pickup_hour",
    "passenger_count",
    "is_weekend",
    "trip_duration",
    "trip_distance",
]
assembler = VectorAssembler(inputCols=_feature_columns, outputCol="features")

# Keep only the feature vector and the label for modelling.
_assembled = assembler.transform(df2)
df_ml = _assembled.select("features", "fare_amount")

# 80/20 train/test split with a fixed seed.
train_data, test_data = df_ml.randomSplit(weights=[0.8, 0.2], seed=42)
feature_names = assembler.getInputCols()

In [None]:
# Preview the first five assembled training rows.
train_data.show(n=5)

+--------------------+-----------+
|            features|fare_amount|
+--------------------+-----------+
|[1.0,1.0,0.0,0.56...|        3.0|
|[1.0,1.0,0.0,1.53...|        3.5|
|[1.0,1.0,0.0,2.03...|        4.5|
|[1.0,1.0,0.0,2.21...|        4.0|
|[1.0,1.0,0.0,2.3,...|        4.0|
+--------------------+-----------+
only showing top 5 rows



In [None]:
# xgboost.spark estimators use the scikit-learn style snake_case parameter
# names. The previous camelCase spellings (maxDepth, learningRate, evalMetric)
# are not recognised, so the intended learning rate of 0.1 was never applied.
xgb_regressor = SparkXGBRegressor(
    label_col="fare_amount",
    features_col="features",
    max_depth=6,
    n_estimators=100,
    learning_rate=0.1,
    eval_metric="rmse",
)
# Time only the fit call.
start_time = time.time()
model = xgb_regressor.fit(train_data)
fit_time = time.time() - start_time
print(f"[Timing] XGBoost model fit: {fit_time:.2f} seconds")


In [None]:
# Score the hold-out split and preview the first 20 rows (the default for show).
predict_df = model.transform(test_data)
predict_df.show(n=20)

+--------------------+-----------+-----------------+
|            features|fare_amount|       prediction|
+--------------------+-----------+-----------------+
|[1.0,1.0,0.0,1.4,...|        3.5|3.745875120162964|
|[1.0,1.0,0.0,1.63...|        4.0|    4.19287109375|
|[1.0,1.0,0.0,1.71...|        3.5|3.874260187149048|
|[1.0,1.0,0.0,2.56...|        5.0|5.219765663146973|
|[1.0,1.0,0.0,3.13...|        4.0|4.094642639160156|
|[1.0,1.0,0.0,3.31...|        5.0|5.251624584197998|
|[1.0,1.0,0.0,3.46...|        4.5|4.577670574188232|
|[1.0,1.0,0.0,3.8,...|        5.5|5.130946159362793|
|[1.0,1.0,0.0,4.08...|        5.5|5.364330291748047|
|[1.0,1.0,0.0,4.16...|        5.5|5.407935619354248|
|[1.0,1.0,0.0,4.3,...|        5.5|5.320507526397705|
|[1.0,1.0,0.0,4.43...|        5.5|5.349546909332275|
|[1.0,1.0,0.0,4.58...|        7.5|7.386869430541992|
|[1.0,1.0,0.0,4.71...|        6.5|6.100586414337158|
|[1.0,1.0,0.0,5.13...|        7.0|6.794867515563965|
|[1.0,1.0,0.0,5.33...|        6.0|6.2954306602

In [None]:
# Score the hold-out split once and cache the result: every evaluate() call and
# the final show() below start their own Spark job, and without the cache each
# of them would re-run the model transform over the test data.
predictions = model.transform(test_data).cache()


def _fare_evaluator(metric_name):
    """Return a RegressionEvaluator scoring `prediction` against `fare_amount`."""
    return RegressionEvaluator(
        labelCol="fare_amount", predictionCol="prediction", metricName=metric_name
    )


evaluator_rmse = _fare_evaluator("rmse")
rmse = evaluator_rmse.evaluate(predictions)
print(f"Test RMSE: {rmse:.4f}")

evaluator_mae = _fare_evaluator("mae")
mae = evaluator_mae.evaluate(predictions)
print(f"Test MAE: {mae:.4f}")

evaluator_r2 = _fare_evaluator("r2")
r2 = evaluator_r2.evaluate(predictions)
print(f"Test R-squared: {r2:.4f}")

# Show label, prediction and the full (untruncated) feature vector side by side.
predictions.select("fare_amount", "prediction", "features").show(12, truncate=False)