# Project 2

## Setup

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, floor, window, concat_ws
from pyspark.sql.types import StructType, StringType, DoubleType, TimestampType

In [None]:
# Create (or reuse) the SparkSession for the taxi streaming job.
# NOTE(review): no Kafka connector JAR/package is configured in this cell;
# presumably it is supplied by the environment (e.g. spark-defaults or
# --packages) — confirm, otherwise the "kafka" source cannot be loaded.
session_builder = SparkSession.builder.appName("KafkaTaxiStream")
spark = session_builder.getOrCreate()

print("✅ Spark Session created successfully!")

In [None]:
# Schema of one taxi-trip JSON record, as (column name, Spark type) pairs.
# The order here is the column order of the parsed stream.
taxi_trip_fields = [
    ("medallion", StringType()),
    ("hack_license", StringType()),
    ("pickup_datetime", TimestampType()),
    ("dropoff_datetime", TimestampType()),
    ("trip_time_in_secs", DoubleType()),
    ("trip_distance", DoubleType()),
    ("pickup_longitude", DoubleType()),
    ("pickup_latitude", DoubleType()),
    ("dropoff_longitude", DoubleType()),
    ("dropoff_latitude", DoubleType()),
    ("payment_type", StringType()),
    ("fare_amount", DoubleType()),
    ("surcharge", DoubleType()),
    ("mta_tax", DoubleType()),
    ("tip_amount", DoubleType()),
    ("tolls_amount", DoubleType()),
]

schema = StructType()
for column_name, column_type in taxi_trip_fields:
    # StructType.add appends a nullable field and returns the struct itself
    schema = schema.add(column_name, column_type)

In [None]:
# Subscribe to the "taxi-trips" Kafka topic, replaying it from the earliest offset
kafka_reader = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("subscribe", "taxi-trips")
    .option("startingOffsets", "earliest")
)
taxi_stream = kafka_reader.load()

# The Kafka record value is cast to a string, decoded as JSON with `schema`,
# and the resulting struct is flattened into one column per field
json_payload = taxi_stream.selectExpr("CAST(value AS STRING) as json_str")
decoded_record = from_json(col("json_str"), schema).alias("data")
parsed_taxi_stream = json_payload.select(decoded_record).select("data.*")

# Query 0: Data Cleansing and Setup

In [None]:
# Drop malformed and invalid trips: records without taxi/driver identifiers,
# records with zeroed coordinates, and records whose pickup or dropoff lies
# outside the longitude/latitude bounding box used below
cleaning_rules = [
    "medallion IS NOT NULL AND hack_license IS NOT NULL",
    "pickup_longitude != 0.0 AND pickup_latitude != 0.0",
    "dropoff_longitude != 0.0 AND dropoff_latitude != 0.0",
    "pickup_longitude BETWEEN -74.913585 AND -73.0",
    "pickup_latitude BETWEEN 40.0 AND 41.474937",
    "dropoff_longitude BETWEEN -74.913585 AND -73.0",
    "dropoff_latitude BETWEEN 40.0 AND 41.474937",
]

cleaned_taxi_stream = parsed_taxi_stream
for rule in cleaning_rules:
    # each rule is applied as its own filter, in the order listed
    cleaned_taxi_stream = cleaned_taxi_stream.filter(rule)

In [None]:
# Append every cleaned trip to the in-memory table "taxi_trips_cleaned"
# so it can be inspected with spark.sql from the notebook
cleaned_writer = (
    cleaned_taxi_stream.writeStream
    .format("memory")
    .queryName("taxi_trips_cleaned")
    .outputMode("append")
)
query = cleaned_writer.start()

In [None]:
# Peek at the rows the streaming query has written to the in-memory table so far
cleaned_preview = spark.sql("SELECT * FROM taxi_trips_cleaned")
cleaned_preview.show(truncate=False)

# Query 2: Profitable Areas

## Part 1

In [None]:
from pyproj import Transformer
from shapely.geometry import Point
from pyspark.sql.functions import udf

In [None]:
# Projection from WGS84 longitude/latitude (EPSG:4326) to UTM zone 18N metres
# (EPSG:32618); always_xy=True fixes the argument order to (lon, lat)
transformer = Transformer.from_crs("EPSG:4326", "EPSG:32618", always_xy=True)

# Side length of one grid cell, in metres
cell_size_meters = 250

# Geographic reference point of the grid: the centre of cell 1.1
reference_lon, reference_lat = -74.913585, 41.474937

# The same reference point expressed in projected metres
origin_x, origin_y = transformer.transform(reference_lon, reference_lat)

In [None]:
# Main function to map a longitude/latitude position onto the 600 x 600 cell grid
def get_cell_id(lat, lon):
    """Return the grid cell id ``"<east>.<south>"`` for a position, or None.

    The position is projected to metres with the module-level ``transformer``
    and compared with ``(origin_x, origin_y)``, the projected centre of
    cell 1.1. Cell numbers grow towards the east (first component) and towards
    the south (second component); each cell is ``cell_size_meters`` wide.

    Args:
        lat: latitude in degrees, or None.
        lon: longitude in degrees, or None.

    Returns:
        A string such as ``"12.340"``, or None when an input is missing, the
        projection fails, or the position lies outside cells 1..600 on either
        axis.
    """
    if lat is None or lon is None:
        return None
    try:
        x, y = transformer.transform(lon, lat)
        # The origin is the *centre* of cell 1.1, so the grid's north-west
        # corner lies half a cell further west and north. Without this shift
        # every position would be assigned half a cell off, and points just
        # north or west of the centre would be rejected.
        half_cell = cell_size_meters / 2
        dx = int((x - origin_x + half_cell) // cell_size_meters) + 1  # cells east
        dy = int((origin_y - y + half_cell) // cell_size_meters) + 1  # cells south
    except Exception:
        # Best effort: a position that cannot be projected or converted
        # (e.g. NaN or infinite coordinates) simply gets no cell.
        return None
    if 1 <= dx <= 600 and 1 <= dy <= 600:  # inside the grid
        return f"{dx}.{dy}"
    return None

# Wrap get_cell_id as a Spark UDF whose result column is a string
get_cell_udf = udf(f=get_cell_id, returnType=StringType())

In [None]:
# Tag every cleaned trip with the grid cell of its pickup (start_cell_id) and
# dropoff (end_cell_id); trips with either end outside the grid are dropped
pickup_cell = get_cell_udf("pickup_latitude", "pickup_longitude")
dropoff_cell = get_cell_udf("dropoff_latitude", "dropoff_longitude")

stream_with_cells = (
    cleaned_taxi_stream
    .withColumn("start_cell_id", pickup_cell)
    .withColumn("end_cell_id", dropoff_cell)
    .filter("start_cell_id IS NOT NULL AND end_cell_id IS NOT NULL")
)

In [None]:
# Count rides per route (start cell -> end cell) within 30-minute windows of
# dropoff time, with a 30-minute watermark, ranked by ride count
dropoff_window = window(col("dropoff_datetime"), "30 minutes")

route_counts = (
    stream_with_cells
    .withWatermark("dropoff_datetime", "30 minutes")
    .groupBy(dropoff_window, col("start_cell_id"), col("end_cell_id"))
    .count()
)

frequent_routes = (
    route_counts
    .select("window", "start_cell_id", "end_cell_id", "count")
    .orderBy(col("count").desc())
)

In [None]:
# Publish the ranked route counts to the in-memory table "top_routes";
# "complete" mode rewrites the whole result on every trigger
(
    frequent_routes.writeStream
    .format("memory")
    .queryName("top_routes")
    .outputMode("complete")
    .start()
)

In [None]:
# Show the 10 routes with the highest ride counts. The explicit ORDER BY is
# needed because LIMIT on its own does not guarantee which rows are returned.
# The streaming query runs asynchronously, so re-run this cell if the table
# is still empty.
spark.sql("SELECT * FROM top_routes ORDER BY `count` DESC LIMIT 10").show(truncate=False)