In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType
from delta import *
import os
import time
# !pip install delta-spark

In [2]:
# Create SparkSession with Delta Lake support
builder = SparkSession.builder.appName("SensorDataWindow") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.driver.memory", "5g")

spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [3]:
schema = StructType([
    StructField("medallion", StringType(), True),
    StructField("hack_license", StringType(), True),
    StructField("pickup_datetime", TimestampType(), True),
    StructField("dropoff_datetime", TimestampType(), True),
    StructField("trip_time_in_secs", IntegerType(), True),
    StructField("trip_distance", DoubleType(), True),
    StructField("pickup_longitude", DoubleType(), True),
    StructField("pickup_latitude", DoubleType(), True),
    StructField("dropoff_longitude", DoubleType(), True),
    StructField("dropoff_latitude", DoubleType(), True),
    StructField("payment_type", StringType(), True),
    StructField("fare_amount", DoubleType(), True),
    StructField("surcharge", DoubleType(), True),
    StructField("mta_tax", DoubleType(), True),
    StructField("tip_amount", DoubleType(), True),
    StructField("tolls_amount", DoubleType(), True),
    StructField("total_amount", DoubleType(), True)
])

# Query 0

In [4]:
#original, unmodified dataset
rides_df = (spark.read
            .schema(schema)
            .csv("input/minified_sorted_data.csv") #path
            )

In [5]:
rides_df.printSchema()

root
 |-- medallion: string (nullable = true)
 |-- hack_license: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- trip_time_in_secs: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- pickup_longitude: double (nullable = true)
 |-- pickup_latitude: double (nullable = true)
 |-- dropoff_longitude: double (nullable = true)
 |-- dropoff_latitude: double (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- surcharge: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- total_amount: double (nullable = true)



In [6]:
# remove rows with null values
initial_count = rides_df.count()
rides_df = rides_df.dropna()

print(f"Removed {initial_count - rides_df.count()} lines")

Removed 30 lines


In [8]:
# check if medallion and hash license are valid md5, remove the row otherwise

MD5_PATTERN = r"^[a-fA-F0-9]{32}$"

initial_count = rides_df.count()

rides_df = rides_df.filter(
    col("medallion").rlike(MD5_PATTERN) &
    col("hack_license").rlike(MD5_PATTERN)
)

print(f"Removed {initial_count - rides_df.count()} lines")

Removed 0 lines


In [9]:
# remove rows with invalid pickup of dropoff times

initial_count = rides_df.count()

rides_df = rides_df.filter(
    col("dropoff_datetime") > col("pickup_datetime")
)

print(f"Removed {initial_count - rides_df.count()} lines")

Removed 17848 lines


In [10]:
# check for illogical numeric values

initial_count = rides_df.count()

rides_df = rides_df.filter(
    (col("trip_time_in_secs") > 0) &
    (col("trip_distance") > 0) &
    (col("fare_amount") >= 0) &
    (col("surcharge") >= 0) &
    (col("mta_tax") >= 0) &
    (col("tip_amount") >= 0) &
    (col("tolls_amount") >= 0)
)

print(f"Removed {initial_count - rides_df.count()} lines")

Removed 31626 lines


In [11]:
# remove lines with invalid fare calculation, eg where total_amount does not equal the sum of all fees

initial_count = rides_df.count()

rides_df = rides_df.filter(
    col("total_amount") ==
    (
        col("fare_amount") + 
        col("surcharge") + 
        col("mta_tax") + 
        col("tip_amount") + 
        col("tolls_amount")
    )
)

print(f"Removed {initial_count - rides_df.count()} lines")

Removed 135788 lines


In [12]:
rides_df.count()

7314708

In [13]:
rides_df_1000 = rides_df.limit(1000)

rides_df_1000.write.csv("input/rides_df_1000", header=True, mode="overwrite")

In [12]:
# Write the cleaned dataset to file
rides_df.write.csv("input/cleaned_minified_data", header=True, mode="overwrite")

# Query 1

In [27]:
# For inspecting the data
rides_df = (spark.read
            .schema(schema)
            .option("header", "true")
            .csv("input/rides_df_1000") #path
            )

In [15]:
rides_df.show(5, False)

+--------------------------------+--------------------------------+-------------------+-------------------+-----------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+------------+
|medallion                       |hack_license                    |pickup_datetime    |dropoff_datetime   |trip_time_in_secs|trip_distance|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|payment_type|fare_amount|surcharge|mta_tax|tip_amount|tolls_amount|total_amount|
+--------------------------------+--------------------------------+-------------------+-------------------+-----------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+------------+
|8CCC1CF4D81808ECCACC59F5E3A449CC|314B14F9C3CBFC03580340D82DF85C30|2013-01-12 05:26:51|2013-01-12 05:34:48|476              |2.1        

In [4]:
rides_stream = (spark.readStream
                .schema(schema)
                .option("header", "true")
                .csv("input/cleaned_minified_data") #path
                )

In [6]:
# Center of the first grid cell
start_lat = 41.474937
start_long = -74.913585

def distance_meters(lat, long):
    # Every degree is roughly 111.32 km or 111,320 meters
    m_per_degree = 111320

    # TODO this assumes the earth is flat.
    lat_distance = lat - start_lat * m_per_degree
    long_distance = long - start_long * m_per_degree

    return lat_distance, long_distance

In [None]:
import math

# UDF for converting latitude and longitude to a grid cell ID
# Courtesy of Claude 3.7
def lat_long_to_grid(lat, long):
    # Barryville reference point (center of cell 1.1)
    reference_lat = 41.474937
    reference_long = -74.913585
    
    # Calculate distance from reference point
    # For latitude: 1 degree ~ 111 km (varies slightly with latitude)
    # Moving south means decreasing latitude
    lat_dist_km = (reference_lat - lat) * 111.0  # Distance south in km
    
    # For longitude: 1 degree ~ 111 * cos(latitude) km
    # Moving east means increasing longitude
    long_dist_km = (long - reference_long) * 111.0 * math.cos(math.radians(reference_lat))  # Distance east in km
    
    # Convert to meters
    lat_dist_m = lat_dist_km * 1000
    long_dist_m = long_dist_km * 1000
    
    # Check if outside the grid (more than 150km south or east from reference)
    if lat_dist_m < 0 or lat_dist_m > 150000 or long_dist_m < 0 or long_dist_m > 150000:
        return None
    
    # Calculate cell IDs
    # Cell 1.1 starts at reference point (center of the cell)
    # Each cell is 500m x 500m
    # To get the cell number, divide by 500 and add 1
    
    # For the first component (east direction)
    cell_east = int(long_dist_m / 500) + 1
    
    # For the second component (south direction)
    cell_south = int(lat_dist_m / 500) + 1
    
    # Cell ID as "east.south"
    return f"{cell_east}.{cell_south}"

# Register the UDF with Spark
lat_long_to_grid_udf = udf(lat_long_to_grid, StringType())

In [31]:
rides_df.show(5, False)

+--------------------------------+--------------------------------+-------------------+-------------------+-----------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+------------+
|medallion                       |hack_license                    |pickup_datetime    |dropoff_datetime   |trip_time_in_secs|trip_distance|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|payment_type|fare_amount|surcharge|mta_tax|tip_amount|tolls_amount|total_amount|
+--------------------------------+--------------------------------+-------------------+-------------------+-----------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+------------+
|5EE2C4D3BF57BDB455E74B03B89E43A7|E96EF8F6E6122591F9465376043B946D|2013-01-01 00:00:09|2013-01-01 00:00:36|26               |0.1        

In [None]:
most_frequent_routes_query = (rides_df
    .withColumn("start_cell", lat_long_to_grid_udf(col("pickup_latitude"), col("pickup_longitude")))
    .withColumn("end_cell", lat_long_to_grid_udf(col("dropoff_latitude"), col("dropoff_longitude")))
    .filter(col("start_cell").isNotNull() & col("end_cell").isNotNull()) # The udf returns None for invalid cells
    .groupBy(
        window(col("pickup_datetime"), "30 minutes"),
        col("start_cell"),
        col("end_cell")
    )
    .agg(count("*").alias("Number_of_Rides"))
    .orderBy(col("Number_of_Rides").desc())
)

Number of results: 0
+-----------+----------+--------+---------------+
|time_window|start_cell|end_cell|number_of_rides|
+-----------+----------+--------+---------------+
+-----------+----------+--------+---------------+



In [49]:
# Test the reference point (should be in cell 1.1)
print(lat_long_to_grid(41.474937, -74.913585))  # Should return "1.1"

# Test a point 1km east and 1km south (should be in cell 3.3)
print(lat_long_to_grid(41.465937, -74.903585))  # ~1km south and ~1km east

# Test a point outside the grid
print(lat_long_to_grid(40.5, -73.9))  # NYC coordinates - should be None or far south/east

1.1
2.2
169.217


In [40]:
# Test the UDF separately
test_df = rides_df.withColumn("start_cell", lat_long_to_grid_udf(col("pickup_latitude"), col("pickup_longitude")))
test_df = test_df.withColumn("end_cell", lat_long_to_grid_udf(col("dropoff_latitude"), col("dropoff_longitude")))

# Check how many rows have valid cells
valid_cells = test_df.filter(col("start_cell").isNotNull() & col("end_cell").isNotNull())
print(f"Rows with valid start and end cells: {valid_cells.count()}")
print(f"Percentage of valid cells: {valid_cells.count() / test_df.count() * 100:.2f}%")

# Show some sample rows with lat/long and the resulting cells
test_df.select("pickup_latitude", "pickup_longitude", "start_cell", 
               "dropoff_latitude", "dropoff_longitude", "end_cell").show(10)

Rows with valid start and end cells: 0
Percentage of valid cells: 0.00%
+---------------+----------------+----------+----------------+-----------------+--------+
|pickup_latitude|pickup_longitude|start_cell|dropoff_latitude|dropoff_longitude|end_cell|
+---------------+----------------+----------+----------------+-----------------+--------+
|      40.725124|       -73.99221|      NULL|       40.726658|       -73.991646|    NULL|
|      40.768005|        -73.9701|      NULL|       40.767834|       -73.969772|    NULL|
|      40.749657|      -73.975441|      NULL|       40.751991|       -73.977333|    NULL|
|      40.720531|      -74.005165|      NULL|       40.725655|       -74.003929|    NULL|
|      40.716976|      -73.956528|      NULL|       40.715008|        -73.96244|    NULL|
|      40.790169|       -73.95208|      NULL|       40.794323|       -73.948921|    NULL|
|      40.708313|      -74.003197|      NULL|       40.706551|       -74.005608|    NULL|
|      40.757912|      -73.9

In [36]:
# For static data analysis (batch processing)
most_frequent_routes_query = (rides_df
    .withColumn("start_cell", lat_long_to_grid_udf(col("pickup_latitude"), col("pickup_longitude")))
    .withColumn("end_cell", lat_long_to_grid_udf(col("dropoff_latitude"), col("dropoff_longitude")))
    .filter(col("start_cell").isNotNull() & col("end_cell").isNotNull()) # The udf returns None for invalid cells
    # Use date_trunc instead of window for batch processing
    .withColumn("time_window", date_trunc("hour", col("pickup_datetime")))
    .groupBy(
        col("time_window"),
        col("start_cell"),
        col("end_cell")
    )
    .agg(count("*").alias("number_of_rides"))
    .orderBy(col("number_of_rides").desc())
)

# Check if the query returns results
print(f"Number of results: {most_frequent_routes_query.count()}")
most_frequent_routes_query.show(10)

Number of results: 0
+-----------+----------+--------+---------------+
|time_window|start_cell|end_cell|number_of_rides|
+-----------+----------+--------+---------------+
+-----------+----------+--------+---------------+



In [34]:
most_frequent_routes_query.show(5, False)

+------+----------+--------+---------------+
|window|start_cell|end_cell|Number_of_Rides|
+------+----------+--------+---------------+
+------+----------+--------+---------------+



In [13]:
# Function to create table if not exists
def create_table_if_exists(output_path, table_name):
    data_exists = False
    for _i in range(5):  # Retry for 60 seconds
        try:
            time.sleep(1)
            files = os.listdir(output_path)
            for _f in files:
                if ".parquet" in _f:
                    if len(os.listdir(f"{output_path}/_delta_log")) > 0:
                        print("data exists")
                        data_exists = True
                        break
            if data_exists:
                spark.sql(f"CREATE TABLE IF NOT EXISTS {table_name} USING DELTA LOCATION '{output_path}'")
                break
        except Exception as e:
            print(e)  # Uncomment if you want to see exceptions
            pass

In [25]:
checkpoint_path = "output/_checkpoint"
output_path = "output/most_frequent_routes"
os.makedirs(output_path, exist_ok=True)

table_name = "most_frequent_routes"
create_table_if_exists(output_path, table_name)

(most_frequent_routes_query.
    writeStream
    .format("delta")
    .outputMode("complete")
    .option("checkpointLocation", checkpoint_path)
    .queryName("most_frequent_routes")
    .trigger(processingTime="5 seconds")
    .start(output_path)
    .awaitTermination(timeout = 120)
)

data exists


False

In [24]:
# Check for active streams and stop them if they exist
for query in spark.streams.active:
    if query.name == "most_frequent_routes":
        print(f"Stopping existing query: {query.name}")
        query.stop()

Stopping existing query: most_frequent_routes


In [26]:
df = spark.read.format("delta").load(output_path)
df.toPandas().sort_values(by=['window'], ascending=True).head(10)

Unnamed: 0,window,start_cell,end_cell,Number_of_Rides
