# Project 2

## Setup

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, floor, window, concat_ws
from pyspark.sql.types import StructType, StringType, DoubleType, TimestampType

In [2]:
# Initialize SparkSession with the Kafka JAR
spark = SparkSession.builder \
    .appName("KafkaTaxiStream") \
    .getOrCreate()

print("✅ Spark Session created successfully!")

✅ Spark Session created successfully!


----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 56070)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.11/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.11/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.11/socketserver.py", line 755, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 295, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 267, in poll
    if self.rfile in r and func():
                           ^^^^^^
  File "/usr/local/spark/python/pyspark/accumulators.py", line 271, in accum_updates
    num_updates =

In [3]:
# Define Schema for Incoming Data
schema = StructType() \
    .add("medallion", StringType()) \
    .add("hack_license", StringType()) \
    .add("pickup_datetime", TimestampType()) \
    .add("dropoff_datetime", TimestampType()) \
    .add("trip_time_in_secs", DoubleType()) \
    .add("trip_distance", DoubleType()) \
    .add("pickup_longitude", DoubleType()) \
    .add("pickup_latitude", DoubleType()) \
    .add("dropoff_longitude", DoubleType()) \
    .add("dropoff_latitude", DoubleType()) \
    .add("payment_type", StringType()) \
    .add("fare_amount", DoubleType()) \
    .add("surcharge", DoubleType()) \
    .add("mta_tax", DoubleType()) \
    .add("tip_amount", DoubleType()) \
    .add("tolls_amount", DoubleType())

In [4]:
# Read Kafka stream
taxi_stream = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:9092") \
    .option("subscribe", "taxi-trips") \
    .option("startingOffsets", "earliest") \
    .load()

# Parse JSON
parsed_taxi_stream = taxi_stream.selectExpr("CAST(value AS STRING) as json_str") \
    .select(from_json(col("json_str"), schema).alias("data")) \
    .select("data.*")

# Query 0: Data Cleansing and setup

In [5]:
# remove malformed and invalid data
cleaned_taxi_stream = parsed_taxi_stream \
    .filter("medallion IS NOT NULL AND hack_license IS NOT NULL") \
    .filter("pickup_longitude != 0.0 AND pickup_latitude != 0.0") \
    .filter("dropoff_longitude != 0.0 AND dropoff_latitude != 0.0")

In [6]:
query = cleaned_taxi_stream.writeStream \
    .outputMode("append") \
    .format("memory") \
    .queryName("taxi_trips_cleaned") \
    .start()

In [7]:
# View cleansed stream in notebook
spark.sql("SELECT * FROM taxi_trips_cleaned").show(truncate=False)

+---------+------------+---------------+----------------+-----------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+
|medallion|hack_license|pickup_datetime|dropoff_datetime|trip_time_in_secs|trip_distance|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|payment_type|fare_amount|surcharge|mta_tax|tip_amount|tolls_amount|
+---------+------------+---------------+----------------+-----------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+
+---------+------------+---------------+----------------+-----------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+



# Query 1: Frequent Routes

## Part 1

In [8]:
from pyspark.sql.functions import udf

In [9]:
reference_lat = 41.474937
reference_lon = -74.913585
total_cells = 300

# Cell sizes
cell_size_lat_deg = 0.004491556  # 500m south
cell_size_lon_deg = 0.005986     # 500m east

In [10]:
# Main function to calculate how far the location is from the origin
def get_cell_id(lat, lon):
    if lat is None or lon is None:
        return None
    try:
        dx = int((lon - reference_lon) / cell_size_lon_deg) + 1 # how many cells east
        dy = int((reference_lat - lat) / cell_size_lat_deg) + 1 # how many cells south
        if 1 <= dx <= total_cells and 1 <= dy <= total_cells: # validate
            return f"{dx}.{dy}"
        else:
            return None
    except:
        return None

# create spark udf
get_cell_udf = udf(get_cell_id, StringType())

In [11]:
# Take cleaned taxi stream data and convert pickup and dropoff locations into start and end cell IDs
stream_with_cells = cleaned_taxi_stream \
    .withColumn("start_cell_id", get_cell_udf("pickup_latitude", "pickup_longitude")) \
    .withColumn("end_cell_id", get_cell_udf("dropoff_latitude", "dropoff_longitude")) \
    .filter("start_cell_id IS NOT NULL AND end_cell_id IS NOT NULL")

In [12]:
# Count the number of rides for each route in the last 30 minutes 
frequent_routes = stream_with_cells \
    .withWatermark("dropoff_datetime", "30 minutes") \
    .groupBy(
        window(col("dropoff_datetime"), "30 minutes"),
        col("start_cell_id"),
        col("end_cell_id")
    ) \
    .count() \
    .select("start_cell_id", "end_cell_id", "count") \
    .orderBy(col("count").desc())

In [13]:
# store results
frequent_routes.writeStream \
    .outputMode("complete") \
    .format("memory") \
    .queryName("top_routes") \
    .start()

<pyspark.sql.streaming.query.StreamingQuery at 0x7f57d85216d0>

In [14]:
spark.sql("SELECT * FROM top_routes LIMIT 10").show(truncate=False)

+-------------+-----------+-----+
|start_cell_id|end_cell_id|count|
+-------------+-----------+-----+
+-------------+-----------+-----+



## Part 2

# Query 2: Profitable Areas

## Part 1

In [15]:
from pyspark.sql import functions as F
from pyspark.sql.functions import col, expr
from pyspark.sql.functions import current_timestamp
from pyspark.sql.functions import window, max as max_

In [16]:
# divide last query 1 degrees by 2
cell_size_lat_deg = 0.002245778   # 250m south
cell_size_lon_deg = 0.002993      # 250m east
total_cells = 600

get_cell_udf = udf(get_cell_id, StringType())

In [17]:
profitAgg = cleaned_taxi_stream \
    .filter("fare_amount >= 0 AND tip_amount >= 0") \
    .withColumn("start_cell_id", get_cell_udf("pickup_latitude", "pickup_longitude")) \
    .withColumn("profit", col("fare_amount") + col("tip_amount")) \
    .filter("start_cell_id IS NOT NULL") \
    .withWatermark("dropoff_datetime", "15 minutes") \
    .groupBy(window("dropoff_datetime", "15 minutes"), col("start_cell_id")) \
    .agg(
        expr("percentile_approx(profit, 0.5)").alias("median_profit"),
        max_("pickup_datetime").alias("pickup_datetime"),
        max_("dropoff_datetime").alias("dropoff_datetime")
    ) \
    .selectExpr("start_cell_id as cell_id", 
                "median_profit", 
                "window.end as profit_window_end", 
                "pickup_datetime", 
                "dropoff_datetime")

profitAgg.writeStream \
    .format("memory") \
    .queryName("profitAgg") \
    .outputMode("complete") \
    .trigger(processingTime="30 seconds") \
    .start()

<pyspark.sql.streaming.query.StreamingQuery at 0x7f57d0308a50>

In [18]:
emptyAgg = cleaned_taxi_stream \
    .withColumn("dropoff_cell_id", get_cell_udf("dropoff_latitude", "dropoff_longitude")) \
    .filter("dropoff_cell_id IS NOT NULL") \
    .withWatermark("dropoff_datetime", "30 minutes") \
    .groupBy(window("dropoff_datetime", "30 minutes"), col("dropoff_cell_id")) \
    .agg(expr("approx_count_distinct(medallion)").alias("empty_taxis")) \
    .selectExpr("dropoff_cell_id as cell_id", "empty_taxis", "window.end as empty_window_end")

emptyAgg.writeStream \
    .format("memory") \
    .queryName("emptyAgg") \
    .outputMode("complete") \
    .trigger(processingTime="30 seconds") \
    .start()


<pyspark.sql.streaming.query.StreamingQuery at 0x7f57d030b590>

In [19]:
def process_batch(_, __):
    spark.sql("""
        SELECT 
            p.pickup_datetime, p.dropoff_datetime, p.cell_id AS profitable_cell_id,
            e.empty_taxis AS empty_taxies_in_cell, p.median_profit AS median_profit_in_cell,
            CASE WHEN e.empty_taxis = 0 THEN NULL ELSE p.median_profit / e.empty_taxis END AS profitability_of_cell
        FROM profitAgg p
        JOIN emptyAgg e ON p.cell_id = e.cell_id
        ORDER BY profitability_of_cell DESC
        LIMIT 10
    """).show(truncate=False, n=50)

cleaned_taxi_stream.selectExpr("CAST(NULL AS STRING) as dummy") \
    .writeStream \
    .foreachBatch(process_batch) \
    .outputMode("update") \
    .trigger(processingTime="30 seconds") \
    .start()


<pyspark.sql.streaming.query.StreamingQuery at 0x7f57d813efd0>

+---------------+----------------+------------------+--------------------+---------------------+---------------------+
|pickup_datetime|dropoff_datetime|profitable_cell_id|empty_taxies_in_cell|median_profit_in_cell|profitability_of_cell|
+---------------+----------------+------------------+--------------------+---------------------+---------------------+
+---------------+----------------+------------------+--------------------+---------------------+---------------------+

+---------------+----------------+------------------+--------------------+---------------------+---------------------+
|pickup_datetime|dropoff_datetime|profitable_cell_id|empty_taxies_in_cell|median_profit_in_cell|profitability_of_cell|
+---------------+----------------+------------------+--------------------+---------------------+---------------------+
+---------------+----------------+------------------+--------------------+---------------------+---------------------+

+-------------------+-------------------+-----