In [1]:
import numpy as np
import pandas as pd

In [None]:
# Without pyspark, I used pyarrow earlier to load the parquet file into a dataframe.
# pyarrow == 21.0.0 is extremely new and unstable — it was only released recently and includes several known issues with extension types, registration, and parquet reading.
# ! pip install pyarrow==17.0.0

# Install pyspark

! pip3 install pyspark


In [None]:
# pyspark needs a Java installation to run, so check that one is available.
# This shell command prints the installed Java version; if the command is not
# found, Java has to be installed before the cells below will work.

! java -version

In [2]:
from pyspark.sql import SparkSession

# Build a session named "test" that runs Spark locally ("local[*]" master);
# getOrCreate() returns an already-running session if there is one.
spark = (
    SparkSession.builder
    .appName("test")
    .master("local[*]")
    .getOrCreate()
)


# Creating the session may print up to four warnings the first time; all of them are safe to ignore:

# | Warning               | Meaning                    |
# | --------------------- | -------------------------- |
# | incubator modules     | Java optimization feature  |
# | hostname loopback     | Spark selecting correct IP |
# | native-hadoop missing | You're not using Hadoop    |
# | default log level     | Normal Spark behavior      |


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/12/04 22:53:06 WARN Utils: Your hostname, Ishaans-MacBook-Air.local, resolves to a loopback address: 127.0.0.1; using 192.168.1.7 instead (on interface en0)
25/12/04 22:53:06 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/04 22:53:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/12/04 22:53:07 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [None]:
# Sanity check: if the SparkSession was created correctly, its version
# string can be read and printed.

spark_version = spark.version
print("Spark version:", spark_version)

In [3]:
# The path below is relative, so the parquet file must sit in the same
# folder as this notebook.

trip_data_path = 'yellow_tripdata_2025-08.parquet'
df = spark.read.parquet(trip_data_path)

In [4]:
# Preview the data: the first 20 rows, with long cell values truncated
# (these are the defaults of show(), spelled out explicitly).
df.show(n=20, truncate=True)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+------------------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|Airport_fee|cbd_congestion_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+------------------+
|       2| 2025-08-01 00:52:23|  2025-08-01 01:12:20|              1|         8.44|         1|                 N|         138|    

In [None]:
# Columns that we need to consider - 

# 1. passenger_count - The number of passengers is entered by the driver, so it may not always be accurate.

# 2. store_and_fwd_flag - Whether the trip record was stored in the vehicle before being sent to the server.
# Values: Y = stored and forwarded later (no network during the trip); N = sent in real time.
# This means rows with the value N are more reliable.

# 3. PULocationID and DOLocationID - These are geographical zone IDs defined by the TLC. They map to neighborhoods (e.g., Midtown, Queens).
# They need to be joined with a lookup file (taxi_zone_lookup.csv).

# 4. payment_type - How the rider paid: 1 Credit card, 2 Cash, 3 No charge, 4 Dispute, 5 Unknown, 6 Voided trip.
# May not need rows with Dispute, Unknown, or Voided trip values.

# 5. tip_amount - Tip paid by the passenger (typically via credit card). Cash tips are not included, which may hinder the tipping analysis.
# For that analysis, the rows where the payment_type column's value is 'Cash' may also need to be removed.

# 6. total_amount - Does not include cash tips.

In [6]:
# Stop the Spark session once the work is done, so that the resources it
# holds are released and its Spark UI port is freed (the startup log above
# shows port 4040 was already taken, which is the kind of port clash this avoids).

spark.stop()