In [7]:
!pip install geopandas
!pip install pyspark



In [8]:
import json
from datetime import datetime

# PySpark imports
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, to_timestamp, unix_timestamp, lag, lead, when, lit, expr,
    sum as _sum, avg as _avg
)
from pyspark.sql.types import StringType, TimestampType
from pyspark.sql.window import Window

# Shapely imports
from shapely.geometry import shape, Point

# 1) Start Spark Session
spark = SparkSession.builder \
    .appName("NYC Taxi Analysis") \
    .getOrCreate()

spark


In [10]:
# Hardcode the file path to your sample taxi CSV
# TAXI_CSV_PATH = 'sample/Sample NYC Data.csv'

# Load the CSV into a Spark DataFrame
df_taxi_raw = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .csv('trip_data/trip_data_1.csv')

# Show the schema and maybe a few rows
df_taxi_raw.printSchema()
df_taxi_raw.show(5, truncate=False)


root
 |-- medallion: string (nullable = true)
 |-- hack_license: string (nullable = true)
 |-- vendor_id: string (nullable = true)
 |-- rate_code: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_time_in_secs: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- pickup_longitude: double (nullable = true)
 |-- pickup_latitude: double (nullable = true)
 |-- dropoff_longitude: double (nullable = true)
 |-- dropoff_latitude: double (nullable = true)

+--------------------------------+--------------------------------+---------+---------+------------------+-------------------+-------------------+---------------+-----------------+-------------+----------------+---------------+-----------------+----------------+
|medallion                       |hack_license                    |vendor_id|

In [11]:
df_taxi_raw = df_taxi_raw.drop("hack_license" ,"vendor_id", "rate_code", "passenger_count", "store_and_fwd_flag",'trip_time_in_secs','trip_distance')

In [12]:
df_taxi_raw.write \
    .option("compression", "snappy") \
    .parquet("trip_data/nyc_taxi_parquet")


In [14]:
df_taxi = spark.read.parquet("trip_data/nyc_taxi_parquet")
df_taxi.printSchema()
df_taxi.show(5)


root
 |-- medallion: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- pickup_longitude: double (nullable = true)
 |-- pickup_latitude: double (nullable = true)
 |-- dropoff_longitude: double (nullable = true)
 |-- dropoff_latitude: double (nullable = true)

+--------------------+-------------------+-------------------+----------------+---------------+-----------------+----------------+
|           medallion|    pickup_datetime|   dropoff_datetime|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|
+--------------------+-------------------+-------------------+----------------+---------------+-----------------+----------------+
|6C8C5507F1928059F...|2013-01-07 05:59:28|2013-01-07 06:01:41|       -73.99041|       40.75613|       -73.983711|       40.765839|
|A3281E8510FED7EE0...|2013-01-05 13:22:48|2013-01-05 13:25:35|      -73.977295|      40.750359|       -73.976913|       40.760567|
|927C

In [13]:
# Hardcode the file path to your borough GeoJSON
GEOJSON_PATH = "sample/nyc-boroughs.geojson"

# Read the borough boundaries from GeoJSON
with open(GEOJSON_PATH, "r") as f:
    geojson_data = json.load(f)

features = geojson_data["features"]

# Function to get polygon area and borough code for sorting
def polygon_area(feature):
    return shape(feature["geometry"]).area

def borough_code(feature):
    return feature["properties"]["boroughCode"]

# Sort features by (boroughCode ascending, area descending)
features_sorted = sorted(
    features,
    key=lambda f: (borough_code(f), -polygon_area(f))
)

# Build list of (boroughName, shapelyShape)
borough_polygons = []
for f in features_sorted:
    bname = f["properties"]["borough"]
    geom  = shape(f["geometry"])
    borough_polygons.append((bname, geom))

# Broadcast to all executors
borough_polygons_bc = spark.sparkContext.broadcast(borough_polygons)

print("Loaded & broadcasted", len(borough_polygons), "borough polygons.")


Loaded & broadcasted 104 borough polygons.


In [15]:
from pyspark.sql.functions import udf,avg

# Create a Python function to identify the borough using Shapely
def find_borough(lon, lat):
    if lon is None or lat is None:
        return "Unknown"
    
    point = Point(lon, lat)
    for (b_name, b_polygon) in borough_polygons_bc.value:
        if b_polygon.contains(point):
            return b_name
    return "Unknown"

# Register this as a Spark UDF
find_borough_udf = udf(find_borough, StringType())

# Add pickup_borough and dropoff_borough columns
df_taxi = df_taxi_raw \
    .withColumn("pickup_borough", find_borough_udf(col("pickup_longitude"), col("pickup_latitude"))) \
    .withColumn("dropoff_borough", find_borough_udf(col("dropoff_longitude"), col("dropoff_latitude")))\
    # .withColumn("pickup_ts", to_timestamp(col("pickup_datetime"), "DD-MM-yy HH:mm"))\
    # .withColumn("dropoff_ts", to_timestamp(col("dropoff_datetime"), "DD-MM-yy HH:mm"))

df_taxi.show(3, truncate=False)


+--------------------------------+-------------------+-------------------+----------------+---------------+-----------------+----------------+--------------+---------------+
|medallion                       |pickup_datetime    |dropoff_datetime   |pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|pickup_borough|dropoff_borough|
+--------------------------------+-------------------+-------------------+----------------+---------------+-----------------+----------------+--------------+---------------+
|89D227B655E5C82AECF13C3F540D4CF4|2013-01-01 15:11:48|2013-01-01 15:18:10|-73.978165      |40.757977      |-73.989838       |40.751171       |Manhattan     |Manhattan      |
|0BD7C8F5BA12B88E0B67BED28BEA73D8|2013-01-06 00:18:35|2013-01-06 00:22:54|-74.006683      |40.731781      |-73.994499       |40.75066        |Manhattan     |Manhattan      |
|0BD7C8F5BA12B88E0B67BED28BEA73D8|2013-01-05 18:49:41|2013-01-05 18:54:23|-74.004707      |40.73777       |-74.009834       |40.72

In [16]:
# 2) Convert to proper TimestampType using the correct format

# 3) Compute duration in seconds
df_taxi = df_taxi.withColumn(
    "duration_sec",
    unix_timestamp(col("dropoff_datetime")) - unix_timestamp(col("pickup_datetime"))
)
# df_taxi.show(5)
# 4) Filter out outliers
df_taxi = df_taxi.filter((col("duration_sec") >= 0) & (col("duration_sec") <= 14400))

df_taxi.select("pickup_datetime", "dropoff_datetime", "duration_sec").show(10, truncate=False)


+-------------------+-------------------+------------+
|pickup_datetime    |dropoff_datetime   |duration_sec|
+-------------------+-------------------+------------+
|2013-01-01 15:11:48|2013-01-01 15:18:10|382         |
|2013-01-06 00:18:35|2013-01-06 00:22:54|259         |
|2013-01-05 18:49:41|2013-01-05 18:54:23|282         |
|2013-01-07 23:54:15|2013-01-07 23:58:20|245         |
|2013-01-07 23:25:03|2013-01-07 23:34:24|561         |
|2013-01-07 15:27:48|2013-01-07 15:38:37|649         |
|2013-01-08 11:01:15|2013-01-08 11:08:14|419         |
|2013-01-07 12:39:18|2013-01-07 13:10:56|1898        |
|2013-01-07 18:15:47|2013-01-07 18:20:47|300         |
|2013-01-07 15:33:28|2013-01-07 15:49:26|958         |
+-------------------+-------------------+------------+
only showing top 10 rows



In [17]:
df_taxi.printSchema()

root
 |-- medallion: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- pickup_longitude: double (nullable = true)
 |-- pickup_latitude: double (nullable = true)
 |-- dropoff_longitude: double (nullable = true)
 |-- dropoff_latitude: double (nullable = true)
 |-- pickup_borough: string (nullable = true)
 |-- dropoff_borough: string (nullable = true)
 |-- duration_sec: long (nullable = true)



In [18]:
from pyspark.sql.window import Window

# Window spec: partition by medallion, order by pickup time
w = Window.partitionBy("medallion").orderBy("pickup_datetime")

# Get the dropoff time of the previous trip
df_taxi = df_taxi.withColumn("prev_dropoff_datetime", lag(col("dropoff_datetime")).over(w))
# df_taxi.show(2)


# Compute idle time if gap ≤ 4 hours, else 0
df_taxi = df_taxi.withColumn(
    "idle_sec",
    when(
        (col("prev_dropoff_datetime").isNotNull()) &
        (
            (col("pickup_datetime").cast("long") - col("prev_dropoff_datetime").cast("long"))
            <= 14400
        ),
        col("pickup_datetime").cast("long") - col("prev_dropoff_datetime").cast("long")
    ).otherwise(0)
)
# df_taxi.show(5)

# Aggregate per medallion
df_util = df_taxi.groupBy("medallion").agg(
    _sum("duration_sec").alias("total_occupied_time"),
    _sum("idle_sec").alias("total_idle_time")
)

# Compute utilization
df_util = df_util.withColumn(
    "utilization",
    expr("total_occupied_time / (total_occupied_time + total_idle_time)")
)

print("=== Query 1: Utilization per Taxi/Driver ===")
df_util.show(10, truncate=False)


=== Query 1: Utilization per Taxi/Driver ===
+--------------------------------+-------------------+---------------+-------------------+
|medallion                       |total_occupied_time|total_idle_time|utilization        |
+--------------------------------+-------------------+---------------+-------------------+
|0038EF45118925A510975FD0CCD67192|780480             |1509360        |0.34084477518079864|
|00BD5D1AD3A96C997E49E0453A6C5DF1|810120             |1080300        |0.42853968959278893|
|01A2F4366180AEB433600BAEA196BFC7|990364             |1212008        |0.44968061708012996|
|01D13A056D9A26F84C328DFDD5534B55|629460             |826080         |0.4324580568036605 |
|01F24976B8E3FF46A08187C86F1F9AB7|375011             |227511         |0.6224021695473361 |
|02063AF23344CEA458E992EC448C5E73|638880             |929160         |0.4074385857503635 |
|024E99A049B748C443A541B2F6F55E5F|343440             |429240         |0.4444789563596832 |
|025B4E80E8A06FDB0FC0A05E319B0E60|829481     

In [19]:
# 1) Partition by medallion (the taxi ID) and order by dropoff time
w = Window.partitionBy("medallion").orderBy("dropoff_datetime")

# 2) For each trip, find the pickup time of the "next" trip for that same taxi
df_taxi = df_taxi.withColumn("next_pickup_datetime", lead(col("pickup_datetime")).over(w))

# 3) Compute gap in seconds = next pickup - current dropoff
#    Because subtracting two Timestamp columns in Spark 3+ yields an Interval,
#    we convert them to numeric (seconds) first (using either cast("long") or unix_timestamp).
df_taxi = df_taxi.withColumn(
    "gap_to_next_sec",
    unix_timestamp("next_pickup_datetime") - unix_timestamp("dropoff_datetime")
)


In [20]:
# (Optionally) filter out negative or unrealistically large gaps if needed:
df_taxi_valid_gap = df_taxi.filter(
    (col("gap_to_next_sec").isNotNull()) &
    (col("gap_to_next_sec") >= 0) &
    (col("gap_to_next_sec") <= 14400)  # e.g. max 4 hours, if desired
)
df_taxi_valid_gap.show(3)

+--------------------+-------------------+-------------------+----------------+---------------+-----------------+----------------+--------------+---------------+------------+---------------------+--------+--------------------+---------------+
|           medallion|    pickup_datetime|   dropoff_datetime|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|pickup_borough|dropoff_borough|duration_sec|prev_dropoff_datetime|idle_sec|next_pickup_datetime|gap_to_next_sec|
+--------------------+-------------------+-------------------+----------------+---------------+-----------------+----------------+--------------+---------------+------------+---------------------+--------+--------------------+---------------+
|0038EF45118925A51...|2013-01-01 00:07:00|2013-01-01 00:15:00|      -73.961807|      40.759731|       -73.977745|       40.749638|     Manhattan|      Manhattan|         480|                 NULL|       0| 2013-01-01 00:17:00|            120|
|0038EF45118925A51...|2013-0

In [21]:

# 4) Group by the dropoff borough and compute average gap
df_borough_gap = df_taxi_valid_gap.groupBy("dropoff_borough") \
    .agg(avg("gap_to_next_sec").alias("avg_time_to_next_fare_sec"))

df_borough_gap.show(truncate=False)

+---------------+-------------------------+
|dropoff_borough|avg_time_to_next_fare_sec|
+---------------+-------------------------+
|Queens         |2807.452292499565        |
|Unknown        |1235.5707161031012       |
|Brooklyn       |1650.6873404056078       |
|Staten Island  |3016.0944854232525       |
|Manhattan      |728.0687218132239        |
|Bronx          |2335.052113529537        |
+---------------+-------------------------+



In [22]:
df_same_borough = df_taxi.filter(col("pickup_borough") == col("dropoff_borough"))
same_borough_count = df_same_borough.count()

print("=== Query 3: Number of Trips Starting & Ending in the Same Borough ===")
print(f"Count = {same_borough_count}")


=== Query 3: Number of Trips Starting & Ending in the Same Borough ===
Count = 13108532


In [23]:
df_diff_borough = df_taxi.filter(col("pickup_borough") != col("dropoff_borough"))
diff_borough_count = df_diff_borough.count()

print("=== Query 4: Number of Trips Starting in One Borough & Ending in Another ===")
print(f"Count = {diff_borough_count}")


=== Query 4: Number of Trips Starting in One Borough & Ending in Another ===
Count = 1667722


In [24]:
spark.stop()