# Module 5 Homework
In this homework we'll put what we learned about Spark into practice.

In [1]:
# Install PySpark and ngrok
!pip install pyspark[sql,pandas_on_spark,connect]

Collecting pyspark[connect,pandas_on_spark,sql]
  Downloading pyspark-3.5.5.tar.gz (317.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.2/317.2 MB[0m [31m55.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting py4j==0.10.9.7 (from pyspark[connect,pandas_on_spark,sql])
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl.metadata (1.5 kB)
Collecting protobuf!=3.20.0,!=3.20.1,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0.dev0,>=3.20.2 (from googleapis-common-protos>=1.56.4->pyspark[connect,pandas_on_spark,sql])
  Using cached protobuf-5.29.3-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
Using cached protobuf-5.29.3-cp38-abi3-manylinux2014_x86_64.whl (319 kB)
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.5.5-py2.py3-none-

In [2]:
# Import necessary libraries
import os
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [5]:
# Build the SparkSession used by every later cell (an existing one is reused)
spark = (
    SparkSession.builder
    .appName("LocalSparkUI")
    .getOrCreate()
)

## Question 1: Install Spark and PySpark

- Install Spark
- Run PySpark
- Create a local spark session
- Execute spark.version.

What's the output?

> [!NOTE]
> To install PySpark follow this [guide](https://github.com/DataTalksClub/data-engineering-zoomcamp/blob/main/05-batch/setup/pyspark.md)

In [6]:
# --- Question 1: Install Spark and PySpark ---
# Report the version of the running Spark session
spark_version = spark.version
print("Spark Version:", spark_version)

Spark Version: 3.5.5


In [7]:
# --- Download Data ---
!mkdir -p data  # Create the 'data' directory if it doesn't exist
!wget -P data https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-10.parquet
!wget -P data https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv

--2025-03-06 18:47:25--  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-10.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 18.239.238.133, 18.239.238.212, 18.239.238.152, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|18.239.238.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 64346071 (61M) [binary/octet-stream]
Saving to: ‘data/yellow_tripdata_2024-10.parquet’


2025-03-06 18:47:26 (62.5 MB/s) - ‘data/yellow_tripdata_2024-10.parquet’ saved [64346071/64346071]

--2025-03-06 18:47:27--  https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 18.239.238.152, 18.239.238.119, 18.239.238.212, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|18.239.238.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12331 (12K) [text/csv]
Saving to:

## Question 2: Yellow October 2024

Read the October 2024 Yellow taxi trip data into a Spark DataFrame.

Repartition the DataFrame to 4 partitions and save it to Parquet.

What is the average size (in MB) of the Parquet files (those ending with the `.parquet` extension) that were created? Select the answer that most closely matches.

In [8]:
# Read the downloaded October 2024 Yellow taxi trip file
df = spark.read.parquet("data/yellow_tripdata_2024-10.parquet")

# Redistribute the rows across 4 partitions before writing
df_repartitioned = df.repartition(4)

# The default save mode raises an error when the target path already exists, which
# makes this cell fail on every re-run; "overwrite" replaces the previous output instead.
df_repartitioned.write.mode("overwrite").parquet("data/yellow_tripdata_2024-10-repartitioned.parquet")

                                                                                

In [16]:
 # Calculate average file size
repartitioned_dir = "data/yellow_tripdata_2024-10-repartitioned.parquet"

# Walk the output directory once, keeping only entries whose name ends in ".parquet"
# and adding up their sizes in bytes as we go.
parquet_files = []
total_size = 0
with os.scandir(repartitioned_dir) as entries:
    for entry in entries:
        if not entry.name.endswith(".parquet"):
            continue
        parquet_files.append(entry.name)
        total_size += entry.stat().st_size

# Mean size per file, converted from bytes to MB (1 MB = 1024 * 1024 bytes here)
bytes_per_mb = 1024 * 1024
average_size_mb = (total_size / len(parquet_files)) / bytes_per_mb
print(f"Average Parquet file size: {average_size_mb} MB")

Average Parquet file size: 22.394158363342285 MB


In [17]:
# Cross-check the average file size using Spark's "binaryFile" reader.
# The functions come from the existing `functions as F` import: importing `sum` by
# name from pyspark.sql.functions would shadow Python's builtin `sum` for every
# cell executed afterwards.
file_sizes = (
    spark.read.format("binaryFile")
    .load("data/yellow_tripdata_2024-10-repartitioned.parquet/*.parquet")
    .select(F.input_file_name().alias("path"), "length")  # one row per matched file
)

# A single aggregation returns both the summed `length` values and the number of files,
# instead of running one Spark job for the sum and another for the count.
size_stats = file_sizes.agg(
    F.sum("length").alias("total_size"),
    F.count("*").alias("num_files"),
).first()
total_size = size_stats["total_size"]
num_files = size_stats["num_files"]

# Calculate the average size in MB
average_size_mb = (total_size / num_files) / (1024 * 1024)

print(f"Average Parquet file size: {average_size_mb} MB")

Average Parquet file size: 22.394158363342285 MB


## Question 3: Count records 

How many taxi trips were there on the 15th of October?

Consider only trips that started on the 15th of October.

In [18]:
# Spark SQL approach: expose `df` under a view name that SQL queries can reference
trip_view_name = "trip_data"
df.createOrReplaceTempView(trip_view_name)

In [19]:
# Count the trips whose pickup date is October 15th by querying the temp view
oct_15_query = """
    SELECT COUNT(*) AS trip_count
    FROM trip_data
    WHERE DATE(tpep_pickup_datetime) = '2024-10-15'
    """
oct_15_rows = spark.sql(oct_15_query).collect()
count = oct_15_rows[0]["trip_count"]  # single row, single column

print(f"Taxi trips on October 15th: {count}")

Taxi trips on October 15th: 128893


In [20]:
# Same count through the DataFrame API: keep rows whose pickup date is October 15th
pickup_date = F.to_date(F.col("tpep_pickup_datetime"))
oct_15_trips = df.where(pickup_date == '2024-10-15')
count = oct_15_trips.count()
print(f"Taxi trips on October 15th: {count}")

Taxi trips on October 15th: 128893


## Question 4: Longest trip

What is the length of the longest trip in the dataset in hours?

In [21]:
# Longest trip: the largest dropoff-minus-pickup gap, converted from seconds to hours
longest_trip_query = """
    SELECT MAX((UNIX_TIMESTAMP(tpep_dropoff_datetime) - UNIX_TIMESTAMP(tpep_pickup_datetime)) / 3600) AS longest_trip_hours
    FROM trip_data
    """
longest_trip_row = spark.sql(longest_trip_query).collect()[0]
longest_trip_hours = longest_trip_row["longest_trip_hours"]

print(f"Longest trip in hours: {longest_trip_hours}")

[Stage 24:>                                                         (0 + 4) / 4]

Longest trip in hours: 162.61777777777777


                                                                                

In [22]:
# DataFrame-API version of the longest-trip calculation
pickup_seconds = F.unix_timestamp(F.col("tpep_pickup_datetime"))
dropoff_seconds = F.unix_timestamp(F.col("tpep_dropoff_datetime"))
trip_hours = (dropoff_seconds - pickup_seconds) / 3600  # seconds -> hours

longest_trip_row = df.agg(F.max(trip_hours)).collect()[0]
longest_trip_hours = longest_trip_row[0]
print(f"Longest trip in hours: {longest_trip_hours}")

Longest trip in hours: 162.61777777777777


## Question 5: User Interface

On which local port does Spark’s User Interface, which shows the application's dashboard, run?

![Image](img/SparkUI_Local.png)

## Question 6: Least frequent pickup location zone

Load the zone lookup data into a temp view in Spark:

```bash
wget https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv
```

Using the zone lookup data and the Yellow October 2024 data, what is the name of the LEAST frequent pickup location Zone?

In [25]:
# Using SQL
# Read the zone lookup CSV (first row holds the column names) and expose it as a temp view
zone_df = spark.read.option("header", True).csv("data/taxi_zone_lookup.csv")
zone_df.createOrReplaceTempView("zone_lookup")

# Count pickups per zone name and keep the zone with the smallest count
least_frequent_zone_query = """
    SELECT Zone, COUNT(*) AS pickup_count
    FROM trip_data
    JOIN zone_lookup ON trip_data.PULocationID = zone_lookup.LocationID
    GROUP BY Zone
    ORDER BY pickup_count ASC
    LIMIT 1
    """
least_frequent_row = spark.sql(least_frequent_zone_query).collect()[0]
least_frequent_zone = least_frequent_row["Zone"]  # Extract the Zone value

print(f"Least frequent pickup zone: {least_frequent_zone}")



Least frequent pickup zone: Governor's Island/Ellis Island/Liberty Island


                                                                                

In [27]:
# DataFrame-API version of the least-frequent-pickup-zone query.
# An inner join is used so this agrees with the SQL query's plain JOIN: with a left
# join, trips whose PULocationID has no row in the lookup would be grouped under a
# NULL Zone, and that group could be reported as the "least frequent zone".
df_with_zones = df.join(zone_df, df.PULocationID == zone_df.LocationID, "inner")

# Count pickups per zone name, then take the row with the smallest count
pickup_counts = df_with_zones.groupBy("Zone").count()
least_frequent_zone = pickup_counts.orderBy("count").first().Zone
print(f"Least frequent pickup zone: {least_frequent_zone}")



Least frequent pickup zone: Governor's Island/Ellis Island/Liberty Island


                                                                                

In [28]:
# Shut down the Spark session created at the top of the notebook
spark.stop()