In [1]:
!pip install ipython-autotime



In [2]:
%load_ext autotime

time: 73 μs (started: 2025-03-09 12:53:49 +00:00)


# Data enrichment

* Join prickup and dropoff coordinates with location name
* Convert datetimes to timestamps

In [3]:
from pyspark.sql import SparkSession
from sedona.utils import SedonaKryoRegistrator
from pyspark.sql.functions import col, udf, unix_timestamp, to_timestamp
from pyspark.sql.functions import expr
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType

import geopandas as gpd
from shapely import Point

from pathlib import Path

time: 544 ms (started: 2025-03-09 12:53:49 +00:00)


In [4]:
sedona_jar = '/home/jovyan/jars/sedona-spark-shaded-3.0_2.12-1.6.1.jar'
geotools_jar = '/home/jovyan/jars/geotools-wrapper-1.7.0-28.5.jar'
spark = (
    SparkSession.builder
        .config("spark.jars", f"{sedona_jar},{geotools_jar}") 
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
        .config("spark.sql.extensions", "org.apache.sedona.sql.SedonaSqlExtensions")
        .config("spark.driver.memory", "8g")
        .config("spark.executor.memory", "8g")
        .appName('NYC Taxi')    
        .getOrCreate()
)

time: 2.39 s (started: 2025-03-09 12:53:50 +00:00)


In [5]:
data_ready_path = './output/data_ready.parquet'
data_ready_file = Path(data_ready_path)

time: 367 μs (started: 2025-03-09 12:53:52 +00:00)


In [6]:
if not data_ready_file.exists():
    # Introduce schema to maintain data integrity
    schema = StructType([
        StructField("medallion", StringType(), True),
        StructField("hack_license", StringType(), True),
        StructField("vendor_id", StringType(), True),
        StructField("rate_code", IntegerType(), True),
        StructField("store_and_fwd_flag", StringType(), True),
        StructField("pickup_datetime", StringType(), True),
        StructField("dropoff_datetime", StringType(), True),
        StructField("passenger_count", IntegerType(), True),
    
        StructField("trip_time_in_secs", StringType(), True),
        StructField("trip_distance", StringType(), True),
        
        StructField("pickup_longitude", StringType(), True),
        StructField("pickup_latitude", StringType(), True),
        StructField("dropoff_longitude", StringType(), True),
        StructField("dropoff_latitude", StringType(), True)
    ])
    
    data_path = './output/data.parquet'
    data_file = Path(data_path)
    if not data_file.exists():
        csv_data = spark.read.csv('./data/trip_data/*.csv', header=True, schema=schema)
        cols = ["medallion", "pickup_datetime", "dropoff_datetime", "pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude"]
        csv_data_cols = csv_data.selectExpr(cols)
        csv_data_cleaned = csv_data_cols.dropna(subset=cols) # Remove rows with NULL cols (columns with interest)
        csv_data_cleaned.write.mode('overwrite').parquet(data_path)
    
    df_trip = spark.read.load(data_path)


time: 1min 56s (started: 2025-03-09 12:53:52 +00:00)


In [7]:
# Convert pcikup and dropoff time to timestamps

if not data_ready_file.exists():
    DATE_FORMAT = 'yyyy-MM-d HH:mm:ss'
    df_trip_w_ts = (df_trip
             .withColumn('pickup_ts', unix_timestamp(to_timestamp(col("pickup_datetime"), DATE_FORMAT)))
             .withColumn('dropoff_ts', unix_timestamp(to_timestamp(col("dropoff_datetime"), DATE_FORMAT)))
            )

time: 52.7 ms (started: 2025-03-09 12:55:48 +00:00)


In [8]:
# Load spatial data of NY

if not data_ready_file.exists():
    # First we use geopandas to read the geojson
    gdf = gpd.read_file('./data/nyc-boroughs.geojson')
    
    # Convert geom to WKT
    gdf['geom'] = gdf['geometry'].apply(lambda geom: geom.wkt if geom else None)
    
    # geopandas df to pandas df
    pdf = gdf.astype(str)
    
    # pandas df to spark df
    df_geom = spark.createDataFrame(pdf)
    
    # Convert WKT to geom (format) which is used for intersections
    df_geom = df_geom.withColumn("geom", expr("ST_GeomFromWKT(geom)"))

time: 407 ms (started: 2025-03-09 12:55:48 +00:00)


In [9]:
if not data_ready_file.exists():
    # Helper function to convert long and lat to Point
    udf_to_point = udf(lambda lon, lat: Point(lon, lat).wkt if lon is not None and lat is not None else '')
    
    # Trip df to WKT -> geom 
    df_trip_w_points = (df_trip_w_ts
         .withColumn('pickup_point', udf_to_point(col('pickup_longitude'), col('pickup_latitude')))
         .withColumn('dropoff_point', udf_to_point(col('dropoff_longitude'), col('dropoff_latitude')))
         .withColumn('geom_pickup', expr('ST_GeomFromWKT(pickup_point)'))
         .withColumn('geom_dropoff', expr('ST_GeomFromWKT(dropoff_point)'))
        )
    
    df_geom = df_geom.repartition(40)
    df_trip_w_points = df_trip_w_points.repartition(40)
    
    # Join geospatial data to trip pickup locations
    df_trip_w_pickup = df_geom.alias('geo').join(
        df_trip_w_points.alias('travel'),
        expr('ST_Intersects(geo.geom, travel.geom_pickup)'),
        'inner'
    )
    
    # Distinguish pickup borough
    df_trip_w_pickup = df_trip_w_pickup.withColumn('pickup_borough', col('borough'))
    
    # Remove excess columns from pickup
    df_trip_w_pickup = df_trip_w_pickup.drop("geometry", "@id", "geom", "borough", "boroughCode")
    
    # Join geospatial data to trip dropoff locations
    df_trip_w_pickup_n_dropoff = df_geom.alias('geo').join(
        df_trip_w_pickup.alias('travel'),
        expr('ST_Intersects(geo.geom, travel.geom_dropoff)'),
        'inner'
    )
    
    # Distinguish dropoff borough
    df_trip_w_pickup_n_dropoff = df_trip_w_pickup_n_dropoff.withColumn('dropoff_borough', col('borough'))
    
    df_final = df_trip_w_pickup_n_dropoff.select("medallion", "pickup_borough", "dropoff_borough", "pickup_ts", "dropoff_ts")
    df_final.write.mode('overwrite').parquet(data_ready_path)

time: 22min (started: 2025-03-09 12:55:49 +00:00)


In [10]:
df = spark.read.load(data_ready_path)

time: 195 ms (started: 2025-03-09 13:17:50 +00:00)


In [11]:
df.first()

Row(medallion='CB0CF7051AAD729BD1F030EC42E3AAFB', pickup_borough='Brooklyn', dropoff_borough='Brooklyn', pickup_ts=1379879890, dropoff_ts=1379880453)

time: 638 ms (started: 2025-03-09 13:17:50 +00:00)


# Queries

In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp, unix_timestamp, lag, sum, avg
from pyspark.sql.window import Window

time: 5.36 ms (started: 2025-03-09 13:17:50 +00:00)


In [13]:
# Preprocessing for query 1 and 2

# Order by prickup_ts to get sequential trips
# Group by medallion (per taxi)
window = Window.partitionBy('medallion').orderBy('pickup_ts')

df_ts = (df
         .withColumn('dropoff_ts_prev', lag('dropoff_ts').over(window)) # lag() returns previous window row value*
         .withColumn('idle_time', col('pickup_ts') - col('dropoff_ts_prev'))
         .withColumn('trip_time', col('dropoff_ts') - col('pickup_ts'))
        )

# Remove neg idle_time and idle_time > 4h
HOURS_4 = 4 * 60 * 60 # 14400 seconds
df_ts_wo_exceptions = df_ts.filter((col('idle_time') > 0) & (col('idle_time') <= 14400))

time: 120 ms (started: 2025-03-09 13:17:50 +00:00)


## Query 1

In [14]:
# Group by taxi (using medallion col)
df_medal = df_ts_wo_exceptions.groupBy('medallion').agg(
    avg(col('trip_time')).alias('total_trip_time'),
    avg(col('idle_time')).alias('total_idle_time')
)

# Calculate utilization col
df_util = df_medal.withColumn(
    'utilization', col('total_trip_time') / (col('total_trip_time') + col('total_idle_time'))
)

time: 56.3 ms (started: 2025-03-09 13:17:51 +00:00)


In [15]:
df_util.show()

+--------------------+-----------------+------------------+-------------------+
|           medallion|  total_trip_time|   total_idle_time|        utilization|
+--------------------+-----------------+------------------+-------------------+
|01F24976B8E3FF46A...|847.6214767764986|472.89440254069075|  0.641886621776018|
|025B4E80E8A06FDB0...|812.2154628496091|    919.2627366042| 0.4690878944394568|
|026B27179DE85CFDC...|  716.52846934071| 800.9352236053481| 0.4721882129183705|
|03F2B14F5C52CC104...|760.6465364120781| 993.3090586145648|0.43367491090931737|
|0545C6C1D0CDD7406...|717.4148034363088| 809.4505888376856| 0.4698611986796341|
|0646CC81D58EFCE89...|711.4439345160628| 1014.493321460374|0.41220729899220376|
|065BED6566F2BB3F0...|761.9670631290028| 847.5663311985361| 0.4734086697513671|
|06EAD4C8D98202F1E...|689.3595683716393| 911.5951350362739|0.43059279997380084|
|08E9F5633328D780C...|705.3899524375743| 695.5702140309156| 0.5035046458285148|
|0A7F5AE428801886F...|792.6368642973927|

## Query 2

In [16]:
df_time_to_next = df_ts_wo_exceptions.groupBy('dropoff_borough').agg(
    avg(col('idle_time')).alias('avg_time_for_next_ride')
)

time: 13.9 ms (started: 2025-03-09 13:18:17 +00:00)


In [17]:
df_time_to_next.show()

+---------------+----------------------+
|dropoff_borough|avg_time_for_next_ride|
+---------------+----------------------+
|         Queens|     1604.036819771684|
|       Brooklyn|     1187.756228123166|
|  Staten Island|    1666.4436438497169|
|      Manhattan|     823.5725760747847|
|          Bronx|    1381.9571459888841|
+---------------+----------------------+

time: 41.7 s (started: 2025-03-09 13:18:17 +00:00)


## Query 3

In [18]:
c_trips_end_start_loc_same = df.where(
    (col("pickup_borough").isNotNull()) & 
    (col("dropoff_borough").isNotNull()) & 
    (col("pickup_borough") == col("dropoff_borough"))
).count()

time: 1.22 s (started: 2025-03-09 13:18:58 +00:00)


In [19]:
c_trips_end_start_loc_same

148705597

time: 2.78 ms (started: 2025-03-09 13:19:00 +00:00)


## Query 4

In [20]:
c_trips_end_start_loc_not_same = df.where(
    (col("pickup_borough").isNotNull()) & 
    (col("dropoff_borough").isNotNull()) & 
    (col("pickup_borough") != col("dropoff_borough"))
).count()

time: 1.13 s (started: 2025-03-09 13:19:00 +00:00)


In [21]:
c_trips_end_start_loc_not_same

20149407

time: 2.96 ms (started: 2025-03-09 13:19:01 +00:00)


## Miscellaneous

Tests should render True!

In [22]:
is_count_eq = df.count() == c_trips_end_start_loc_same + c_trips_end_start_loc_not_same
print(f"Q3+Q4 equals total sum? {is_count_eq}")

Q3+Q4 equals total sum? True
time: 207 ms (started: 2025-03-09 13:19:01 +00:00)
