# Project 2

## Setup

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, floor, window, concat_ws
from pyspark.sql.types import StructType, StringType, DoubleType, TimestampType

In [2]:
# Initialize SparkSession with the Kafka JAR
spark = SparkSession.builder \
    .appName("KafkaTaxiStream") \
    .getOrCreate()

print("✅ Spark Session created successfully!")

✅ Spark Session created successfully!


In [3]:
# Define Schema for Incoming Data
schema = StructType() \
    .add("medallion", StringType()) \
    .add("hack_license", StringType()) \
    .add("pickup_datetime", TimestampType()) \
    .add("dropoff_datetime", TimestampType()) \
    .add("trip_time_in_secs", DoubleType()) \
    .add("trip_distance", DoubleType()) \
    .add("pickup_longitude", DoubleType()) \
    .add("pickup_latitude", DoubleType()) \
    .add("dropoff_longitude", DoubleType()) \
    .add("dropoff_latitude", DoubleType()) \
    .add("payment_type", StringType()) \
    .add("fare_amount", DoubleType()) \
    .add("surcharge", DoubleType()) \
    .add("mta_tax", DoubleType()) \
    .add("tip_amount", DoubleType()) \
    .add("tolls_amount", DoubleType())

In [4]:
# Read Kafka stream
taxi_stream = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:9092") \
    .option("subscribe", "taxi-trips") \
    .option("startingOffsets", "earliest") \
    .load()

# Parse JSON
parsed_taxi_stream = taxi_stream.selectExpr("CAST(value AS STRING) as json_str") \
    .select(from_json(col("json_str"), schema).alias("data")) \
    .select("data.*")

# Query 0: Data Cleansing and setup

In [5]:
# remove malformed and invalid data
cleaned_taxi_stream = parsed_taxi_stream \
    .filter("medallion IS NOT NULL AND hack_license IS NOT NULL") \
    .filter("pickup_longitude != 0.0 AND pickup_latitude != 0.0") \
    .filter("dropoff_longitude != 0.0 AND dropoff_latitude != 0.0")

In [6]:
query = cleaned_taxi_stream.writeStream \
    .outputMode("append") \
    .format("memory") \
    .queryName("taxi_trips_cleaned") \
    .start()

In [7]:
# View cleansed stream in notebook
spark.sql("SELECT * FROM taxi_trips_cleaned").show(truncate=False)

+---------+------------+---------------+----------------+-----------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+
|medallion|hack_license|pickup_datetime|dropoff_datetime|trip_time_in_secs|trip_distance|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|payment_type|fare_amount|surcharge|mta_tax|tip_amount|tolls_amount|
+---------+------------+---------------+----------------+-----------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+
+---------+------------+---------------+----------------+-----------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+



# Query 1: Frequent Routes

## Part 1

In [8]:
from pyspark.sql.functions import udf

In [9]:
reference_lat = 41.474937
reference_lon = -74.913585
total_cells = 300

# Cell sizes
cell_size_lat_deg = 0.004491556  # 500m south
cell_size_lon_deg = 0.005986     # 500m east

In [10]:
# Main function to calculate how far the location is from the origin
def get_cell_id(lat, lon):
    if lat is None or lon is None:
        return None
    try:
        dx = int((lon - reference_lon) / cell_size_lon_deg) + 1 # how many cells east
        dy = int((reference_lat - lat) / cell_size_lat_deg) + 1 # how many cells south
        if 1 <= dx <= total_cells and 1 <= dy <= total_cells: # validate
            return f"{dx}.{dy}"
        else:
            return None
    except:
        return None

# create spark udf
get_cell_udf = udf(get_cell_id, StringType())

In [11]:
# Take cleaned taxi stream data and convert pickup and dropoff locations into start and end cell IDs
stream_with_cells = cleaned_taxi_stream \
    .withColumn("start_cell_id", get_cell_udf("pickup_latitude", "pickup_longitude")) \
    .withColumn("end_cell_id", get_cell_udf("dropoff_latitude", "dropoff_longitude")) \
    .filter("start_cell_id IS NOT NULL AND end_cell_id IS NOT NULL")

In [12]:
# Count the number of rides for each route in the last 30 minutes 
frequent_routes = stream_with_cells \
    .withWatermark("dropoff_datetime", "30 minutes") \
    .groupBy(
        window(col("dropoff_datetime"), "30 minutes"),
        col("start_cell_id"),
        col("end_cell_id")
    ) \
    .count() \
    .select("start_cell_id", "end_cell_id", "count") \
    .orderBy(col("count").desc())

In [13]:
# store results
frequent_routes.writeStream \
    .outputMode("complete") \
    .format("memory") \
    .queryName("top_routes") \
    .start()

<pyspark.sql.streaming.query.StreamingQuery at 0x7f53c05b3e10>

In [14]:
spark.sql("SELECT * FROM top_routes LIMIT 10").show(truncate=False)

+-------------+-----------+-----+
|start_cell_id|end_cell_id|count|
+-------------+-----------+-----+
+-------------+-----------+-----+



## Part 2

# Query 2: Profitable Areas

## Part 1

In [15]:
from pyspark.sql.functions import expr
from pyspark.sql import functions as F

In [16]:
# set variables for query 2

# divide last query 1 degrees by 2
cell_size_lat_deg = 0.002245778   # 250m south
cell_size_lon_deg = 0.002993      # 250m east

total_cells = 600

# create user defined function again with new variables
get_cell_udf = udf(get_cell_id, StringType())

In [17]:
# start by computing the profit for start cell 
profit_stream = cleaned_taxi_stream \
    .filter("fare_amount >= 0 AND tip_amount >= 0") \
    .withColumn("start_cell_id", get_cell_udf("pickup_latitude", "pickup_longitude")) \
    .withColumn("profit", col("fare_amount") + col("tip_amount")) \
    .filter("start_cell_id IS NOT NULL")

In [18]:
# apply 15 minute window
median_profit_window = profit_stream \
    .withWatermark("dropoff_datetime", "15 minutes") \
    .groupBy(
        F.window(col("dropoff_datetime"), "15 minutes"),
        col("start_cell_id")
    ) \
    .agg(
         expr("percentile_approx(profit, 0.5) as median_profit"),
         F.max("pickup_datetime").alias("agg_pickup_datetime"),
         F.max("dropoff_datetime").alias("agg_dropoff_datetime")
    )

profit_agg = median_profit_window.select(
    col("start_cell_id").alias("cell_id"),
    col("median_profit"),
    col("window.end").alias("profit_window_end"),
    col("agg_pickup_datetime"),
    col("agg_dropoff_datetime")
)

In [19]:
query_profit = profit_agg.writeStream \
    .format("memory") \
    .queryName("profitAgg") \
    .outputMode("complete") \
    .trigger(once=True) \
    .start()

query_profit.awaitTermination()

In [20]:
# Now you can query the in-memory table and show the results:
spark.sql("select * from profitAgg").show(truncate=False)

+-------+-----------------+-------------------+-------------------+--------------------+
|cell_id|median_profit    |profit_window_end  |agg_pickup_datetime|agg_dropoff_datetime|
+-------+-----------------+-------------------+-------------------+--------------------+
|326.343|4.0              |2013-01-01 00:30:00|2013-01-01 00:18:30|2013-01-01 00:21:56 |
|327.343|5.5              |2013-01-01 00:15:00|2013-01-01 00:09:00|2013-01-01 00:13:00 |
|321.352|7.5              |2013-01-01 00:30:00|2013-01-01 00:20:00|2013-01-01 00:25:00 |
|304.328|8.5              |2013-01-01 00:15:00|2013-01-01 00:13:47|2013-01-01 00:14:24 |
|326.288|7.5              |2013-01-01 00:30:00|2013-01-01 00:19:36|2013-01-01 00:26:47 |
|314.327|8.879999999999999|2013-01-01 00:15:00|2013-01-01 00:07:00|2013-01-01 00:11:00 |
|323.313|10.0             |2013-01-01 00:15:00|2013-01-01 00:06:00|2013-01-01 00:13:22 |
|308.344|18.0             |2013-01-01 00:30:00|2013-01-01 00:06:00|2013-01-01 00:25:00 |
|309.311|6.5         

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.functions import col, expr

empty_taxi_agg = cleaned_taxi_stream \
    .withColumn("dropoff_cell_id", get_cell_udf("dropoff_latitude", "dropoff_longitude")) \
    .filter("dropoff_cell_id IS NOT NULL") \
    .withWatermark("dropoff_datetime", "30 minutes") \
    .groupBy(
         F.window(col("dropoff_datetime"), "30 minutes"),
         col("dropoff_cell_id")
    ) \
    .agg(expr("approx_count_distinct(medallion) as empty_taxis")) \
    .select(
         col("dropoff_cell_id").alias("cell_id"),
         col("empty_taxis"),
         col("window.end").alias("empty_window_end")
    )

# Write the aggregated results to an in-memory table using complete output mode.
query_empty = empty_taxi_agg.writeStream \
    .format("memory") \
    .queryName("emptyAgg") \
    .outputMode("complete") \
    .trigger(once=True) \
    .start()

query_empty.awaitTermination()

# Query and display the in-memory table:
spark.sql("SELECT * FROM emptyAgg").show(truncate=False)


In [None]:
# Load the in-memory tables as DataFrames.
profit_df = spark.table("profitAgg")
empty_df = spark.table("emptyAgg")

# Join the two DataFrames on 'cell_id' and compute profitability.
joined = profit_df.join(empty_df, "cell_id", "inner") \
    .withColumn("profitability", F.col("median_profit") / F.col("empty_taxis"))

# Select the actual aggregated pickup and dropoff timestamps, along with other required columns.
result = joined.select(
    col("agg_pickup_datetime").alias("pickup_datetime"),
    col("agg_dropoff_datetime").alias("dropoff_datetime"),
    col("cell_id").alias("profitable_cell_id"),
    col("empty_taxis").alias("empty_taxies_in_cell"),
    col("median_profit").alias("median_profit_in_cell"),
    col("profitability").alias("profitability_of_cell")
)


In [None]:
# Order the result by profitability in descending order and display the top 10 rows.
result.orderBy(F.col("profitability_of_cell").desc()).show(10, truncate=False)