# Project 2

## Setup

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, floor, window, concat_ws
from pyspark.sql.functions import from_json, col, to_timestamp
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, TimestampType

In [2]:
# Create the notebook's SparkSession, or reuse one that is already running.
# NOTE(review): nothing on this builder adds the Kafka connector JAR/package;
# presumably it is provided by the environment's Spark configuration — confirm.
spark = (
    SparkSession.builder
    .appName("KafkaTaxiStream")
    .getOrCreate()
)

print("✅ Spark Session created successfully!")

✅ Spark Session created successfully!


In [3]:
# Schema of the raw JSON payload: 17 nullable fields keyed by their position
# ("0" .. "16") instead of by name. The list below is ordered by that position.
json_schema = StructType([
    StructField(str(position), field_type)
    for position, field_type in enumerate([
        StringType(),     # 0
        StringType(),     # 1
        TimestampType(),  # 2
        TimestampType(),  # 3
        DoubleType(),     # 4
        DoubleType(),     # 5
        DoubleType(),     # 6
        DoubleType(),     # 7
        DoubleType(),     # 8
        DoubleType(),     # 9
        StringType(),     # 10
        DoubleType(),     # 11
        DoubleType(),     # 12
        DoubleType(),     # 13
        DoubleType(),     # 14
        DoubleType(),     # 15
        DoubleType(),     # 16
    ])
])

# Named schema for a taxi-trip record: identifiers, pickup/dropoff timestamps,
# trip metrics, coordinates, and the fare breakdown. All fields are nullable.
schema = StructType([
    StructField("medallion", StringType()),
    StructField("hack_license", StringType()),
    StructField("pickup_datetime", TimestampType()),
    StructField("dropoff_datetime", TimestampType()),
    StructField("trip_time_in_secs", DoubleType()),
    StructField("trip_distance", DoubleType()),
    StructField("pickup_longitude", DoubleType()),
    StructField("pickup_latitude", DoubleType()),
    StructField("dropoff_longitude", DoubleType()),
    StructField("dropoff_latitude", DoubleType()),
    StructField("payment_type", StringType()),
    StructField("fare_amount", DoubleType()),
    StructField("surcharge", DoubleType()),
    StructField("mta_tax", DoubleType()),
    StructField("tip_amount", DoubleType()),
    StructField("tolls_amount", DoubleType()),
    StructField("total_amount", DoubleType()),
])

# Query 0: Data Cleansing and Setup

In [4]:
# Open a streaming read on the "taxi-trips" Kafka topic, starting from the
# earliest offsets so records already in the topic are consumed as well.
raw_df = (
    spark.readStream
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": "kafka:9092",
        "subscribe": "taxi-trips",
        "startingOffsets": "earliest",
    })
    .load()
)


# Decode each Kafka message value as a JSON string, parse it with the
# positional json_schema, and flatten the parsed struct into top-level columns.
decoded_payload = raw_df.selectExpr("CAST(value AS STRING) as json_data")
parsed_taxi_stream = (
    decoded_payload
    .select(from_json("json_data", json_schema).alias("data"))
    .select("data.*")
)

# Give the positional columns ("0" .. "16") their descriptive names; the list
# order is the payload's field order.
for position, column_name in enumerate([
    "medallion",
    "hack_license",
    "pickup_datetime",
    "dropoff_datetime",
    "trip_time_in_secs",
    "trip_distance",
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
    "payment_type",
    "fare_amount",
    "surcharge",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "total_amount",
]):
    parsed_taxi_stream = parsed_taxi_stream.withColumnRenamed(str(position), column_name)

In [5]:

# Keep only usable trips: both identifiers must be present, and neither the
# pickup nor the dropoff point may have a 0.0 longitude or latitude.
# Rows where a compared coordinate is NULL are dropped too, because the filter
# keeps only rows for which the predicate evaluates to true.
has_identifiers = "medallion IS NOT NULL AND hack_license IS NOT NULL"
has_pickup_coordinates = "pickup_longitude != 0.0 AND pickup_latitude != 0.0"
has_dropoff_coordinates = "dropoff_longitude != 0.0 AND dropoff_latitude != 0.0"

cleaned_taxi_stream = (
    parsed_taxi_stream
    .filter(has_identifiers)
    .filter(has_pickup_coordinates)
    .filter(has_dropoff_coordinates)
)

In [6]:
# Start the streaming query: cleaned rows are appended to an in-memory table
# named "taxi_trips_cleaned", which can then be queried through spark.sql.
query = (
    cleaned_taxi_stream.writeStream
    .format("memory")
    .queryName("taxi_trips_cleaned")
    .outputMode("append")
    .start()
)

In [9]:
# Preview what the streaming query has written to the in-memory table so far,
# printing full cell contents (no truncation).
cleaned_rows = spark.sql("SELECT * FROM taxi_trips_cleaned")
cleaned_rows.show(truncate=False)

+--------------------------------+--------------------------------+-------------------+-------------------+-----------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+------------+
|medallion                       |hack_license                    |pickup_datetime    |dropoff_datetime   |trip_time_in_secs|trip_distance|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|payment_type|fare_amount|surcharge|mta_tax|tip_amount|tolls_amount|total_amount|
+--------------------------------+--------------------------------+-------------------+-------------------+-----------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+------------+
|E79E74C15D90CD93B1564E91E3D64765|145038A0CC99D6982D8001BE668154CA|2013-01-01 00:01:00|2013-01-01 00:02:00|60.0             |0.45       