# Project 2

## Setup

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, floor, window, concat_ws
from pyspark.sql.types import StructType, StringType, DoubleType, TimestampType

In [2]:
# Initialize SparkSession with the Kafka JAR
spark = SparkSession.builder \
    .appName("KafkaTaxiStream") \
    .getOrCreate()

print("✅ Spark Session created successfully!")

✅ Spark Session created successfully!


In [3]:
# Define Schema for Incoming Data
schema = StructType() \
    .add("medallion", StringType()) \
    .add("hack_license", StringType()) \
    .add("pickup_datetime", TimestampType()) \
    .add("dropoff_datetime", TimestampType()) \
    .add("trip_time_in_secs", DoubleType()) \
    .add("trip_distance", DoubleType()) \
    .add("pickup_longitude", DoubleType()) \
    .add("pickup_latitude", DoubleType()) \
    .add("dropoff_longitude", DoubleType()) \
    .add("dropoff_latitude", DoubleType()) \
    .add("payment_type", StringType()) \
    .add("fare_amount", DoubleType()) \
    .add("surcharge", DoubleType()) \
    .add("mta_tax", DoubleType()) \
    .add("tip_amount", DoubleType()) \
    .add("tolls_amount", DoubleType())

In [4]:
# Read Kafka stream
taxi_stream = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:9092") \
    .option("subscribe", "taxi-trips") \
    .option("startingOffsets", "earliest") \
    .load()

# Parse JSON
parsed_taxi_stream = taxi_stream.selectExpr("CAST(value AS STRING) as json_str") \
    .select(from_json(col("json_str"), schema).alias("data")) \
    .select("data.*")

# Query 0: Data Cleansing and setup

In [5]:
# remove malformed and invalid data
cleaned_taxi_stream = parsed_taxi_stream \
    .filter("medallion IS NOT NULL AND hack_license IS NOT NULL") \
    .filter("pickup_longitude != 0.0 AND pickup_latitude != 0.0") \
    .filter("dropoff_longitude != 0.0 AND dropoff_latitude != 0.0") \
    .filter("pickup_longitude BETWEEN -74.913585 AND -73.0") \
    .filter("pickup_latitude BETWEEN 40.0 AND 41.474937") \
    .filter("dropoff_longitude BETWEEN -74.913585 AND -73.0") \
    .filter("dropoff_latitude BETWEEN 40.0 AND 41.474937")

In [6]:
query = cleaned_taxi_stream.writeStream \
    .outputMode("append") \
    .format("memory") \
    .queryName("taxi_trips_cleaned") \
    .start()

In [15]:
# View cleansed stream in notebook
spark.sql("SELECT * FROM taxi_trips_cleaned").show(truncate=False)

+--------------------------------+--------------------------------+-------------------+-------------------+-----------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+
|medallion                       |hack_license                    |pickup_datetime    |dropoff_datetime   |trip_time_in_secs|trip_distance|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|payment_type|fare_amount|surcharge|mta_tax|tip_amount|tolls_amount|
+--------------------------------+--------------------------------+-------------------+-------------------+-----------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+
|07290D3599E7A0D62097A346EFCC1FB5|E7750A37CAB07D0DFF0AF7E3573AC141|2013-01-01 00:00:00|2013-01-01 00:02:00|120.0            |0.44         |-73.956528      |40.716976      |-73

# Query 2: Profitable Areas

## Part 1

In [8]:
from pyproj import Transformer
from shapely.geometry import Point
from pyspark.sql.functions import udf

In [9]:
# transform latitude and longitude to UTM meters, accurate for NYC
transformer = Transformer.from_crs("EPSG:4326", "EPSG:32618", always_xy=True)

reference_lat = 41.474937
reference_lon = -74.913585

# convert the centre of cell 1.1 to meters
origin_x, origin_y = transformer.transform(reference_lon, reference_lat)
cell_size_meters = 250

In [10]:
# Main function to calculate how far the location is from the origin
def get_cell_id(lat, lon):
    if lat is None or lon is None:
        return None
    try:
        x, y = transformer.transform(lon, lat)
        dx = int((x - origin_x) // cell_size_meters) + 1 # how many cells east
        dy = int((origin_y - y) // cell_size_meters) + 1 # how many cells south
        if 1 <= dx <= 600 and 1 <= dy <= 600: # validate
            return f"{dx}.{dy}" # put together
    except:
        return None

# create spark udf
get_cell_udf = udf(get_cell_id, StringType())

In [11]:
# Take cleaned taxi stream data and convert pickup and dropoff locations into start and end cell IDs
stream_with_cells = cleaned_taxi_stream \
    .withColumn("start_cell_id", get_cell_udf("pickup_latitude", "pickup_longitude")) \
    .withColumn("end_cell_id", get_cell_udf("dropoff_latitude", "dropoff_longitude")) \
    .filter("start_cell_id IS NOT NULL AND end_cell_id IS NOT NULL")

In [12]:
# Count the number of rides for each route in the last 30 minutes 
frequent_routes = stream_with_cells \
    .withWatermark("dropoff_datetime", "30 minutes") \
    .groupBy(
        window(col("dropoff_datetime"), "30 minutes"),
        col("start_cell_id"),
        col("end_cell_id")
    ) \
    .count() \
    .select("window", "start_cell_id", "end_cell_id", "count") \
    .orderBy(col("count").desc())

In [13]:
# store results in top_routes table
frequent_routes.writeStream \
    .outputMode("complete") \
    .format("memory") \
    .queryName("top_routes") \
    .start()

<pyspark.sql.streaming.query.StreamingQuery at 0x7fe6a07fc990>

In [16]:
spark.sql("SELECT * FROM top_routes LIMIT 10").show(truncate=False)

+------------------------------------------+-------------+-----------+-----+
|window                                    |start_cell_id|end_cell_id|count|
+------------------------------------------+-------------+-----------+-----+
|{2013-01-01 00:00:00, 2013-01-01 00:30:00}|312.318      |312.318    |9    |
|{2013-01-01 00:00:00, 2013-01-01 00:30:00}|325.305      |322.309    |5    |
|{2013-01-01 00:00:00, 2013-01-01 00:30:00}|314.319      |314.319    |5    |
|{2013-01-01 00:00:00, 2013-01-01 00:30:00}|311.316      |307.324    |4    |
|{2013-01-01 00:00:00, 2013-01-01 00:30:00}|318.314      |319.320    |4    |
|{2013-01-01 00:00:00, 2013-01-01 00:30:00}|315.324      |312.321    |4    |
|{2013-01-01 00:00:00, 2013-01-01 00:30:00}|308.333      |308.331    |4    |
|{2013-01-01 00:00:00, 2013-01-01 00:30:00}|313.320      |313.320    |3    |
|{2013-01-01 00:00:00, 2013-01-01 00:30:00}|309.324      |309.320    |3    |
|{2013-01-01 00:00:00, 2013-01-01 00:30:00}|319.314      |323.307    |3    |