# Module 5 Homework
In this homework we'll put what we learned about Spark in practice.

In [1]:
# Install PySpark and ngrok
!pip install pyspark[sql,pandas_on_spark,connect] pyngrok



In [12]:
# Import necessary libraries
import os
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyngrok import ngrok
from google.colab import userdata

In [13]:
# --- Securely Set ngrok Auth Token ---
# You can store your auth token in a Colab secret
# To do this, go to the left sidebar, click the "Secrets" tab (key icon),
# and add a secret named "NGROK_AUTH_TOKEN" with your token as the value.

In [14]:
def setup_spark_ngrok():
    """Sets up SparkSession and ngrok tunnel, returning both objects."""
    try:
        ngrok_auth_token = userdata.get('NGROK_AUTH_TOKEN')
    except KeyError:
        print("ngrok auth token not found. Please add it as a Colab secret.")
        return None, None

    if ngrok_auth_token:
        ngrok.set_auth_token(ngrok_auth_token)
        spark = SparkSession.builder.appName("ColabSparkUI").getOrCreate()
        ngrok_tunnel = ngrok.connect(4040)
        print("Spark UI Public URL:", ngrok_tunnel.public_url)
        return spark, ngrok_tunnel
    else:
        return None, None

In [15]:
spark, ngrok_tunnel = setup_spark_ngrok()





Spark UI Public URL: https://57cd-34-125-55-49.ngrok-free.app


In [16]:
if spark and ngrok_tunnel:
    # You can now use 'spark' and 'ngrok_tunnel' in subsequent cells
    print("SparkSession and ngrok tunnel are ready.")

SparkSession and ngrok tunnel are ready.


## Question 1: Install Spark and PySpark

- Install Spark
- Run PySpark
- Create a local spark session
- Execute spark.version.

What's the output?

> [!NOTE]
> To install PySpark follow this [guide](https://github.com/DataTalksClub/data-engineering-zoomcamp/blob/main/05-batch/setup/pyspark.md)

> Please note in this notebook I am using Google Colab rather than local Spark installation

In [17]:
# --- Question 1: Install Spark and PySpark ---
print("Spark Version:", spark.version)

Spark Version: 3.5.5


In [18]:
# --- Download Data ---
!wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-10.parquet
!wget https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv

--2025-03-06 17:11:01--  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-10.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 18.239.38.147, 18.239.38.181, 18.239.38.83, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|18.239.38.147|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 64346071 (61M) [binary/octet-stream]
Saving to: ‘yellow_tripdata_2024-10.parquet’


2025-03-06 17:11:05 (16.3 MB/s) - ‘yellow_tripdata_2024-10.parquet’ saved [64346071/64346071]

--2025-03-06 17:11:05--  https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 18.239.38.147, 18.239.38.181, 18.239.38.83, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|18.239.38.147|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12331 (12K) [text/csv]
Saving to: ‘taxi_zone_lookup.c

## Question 2: Yellow October 2024

Read the October 2024 Yellow into a Spark Dataframe.

Repartition the Dataframe to 4 partitions and save it to parquet.

What is the average size of the Parquet (ending with .parquet extension) Files that were created (in MB)? Select the answer which most closely matches.

In [19]:
df = spark.read.parquet("yellow_tripdata_2024-10.parquet")
df_repartitioned = df.repartition(4)
df_repartitioned.write.parquet("yellow_tripdata_2024-10-repartitioned.parquet")

In [20]:
 # Calculate average file size
import os
parquet_files = [f for f in os.listdir("yellow_tripdata_2024-10-repartitioned.parquet") if f.endswith(".parquet")]
total_size = 0
for file in parquet_files:
    total_size += os.path.getsize(os.path.join("yellow_tripdata_2024-10-repartitioned.parquet", file))
average_size_mb = (total_size / len(parquet_files)) / (1024 * 1024)
print(f"Average Parquet file size: {average_size_mb} MB")

Average Parquet file size: 23.042235136032104 MB


I will also both Spark Sql and Data Frame

In [26]:
# Using Spark SQL
# Register the DataFrame as a temporary view
df.createOrReplaceTempView("trip_data")

## Question 3: Count records

How many taxi trips were there on the 15th of October?

Consider only trips that started on the 15th of October.

- 85,567
- 105,567
- 125,567
- 145,567

In [21]:
count = df.filter(F.to_date(df.tpep_pickup_datetime) == '2024-10-15').count()
print(f"Taxi trips on October 15th: {count}")

Taxi trips on October 15th: 128893


In [27]:
# Use Spark SQL to count trips on October 15th
count = spark.sql(
    """
    SELECT COUNT(*) AS trip_count
    FROM trip_data
    WHERE DATE(tpep_pickup_datetime) = '2024-10-15'
    """
).collect()[0][0]

print(f"Taxi trips on October 15th: {count}")

Taxi trips on October 15th: 128893


## Question 4: Longest trip

What is the length of the longest trip in the dataset in hours?

- 122
- 142
- 162
- 182

In [23]:
# --- Question 4: Longest trip ---
longest_trip_hours = df.agg(
    F.max(
        (F.unix_timestamp(F.col("tpep_dropoff_datetime")) - F.unix_timestamp(F.col("tpep_pickup_datetime"))) / 3600
    )
).collect()[0][0]
print(f"Longest trip in hours: {longest_trip_hours}")

Longest trip in hours: 162.61777777777777


In [25]:
# Use Spark SQL to calculate the longest trip in hours
longest_trip_hours = spark.sql(
    """
    SELECT MAX((UNIX_TIMESTAMP(tpep_dropoff_datetime) - UNIX_TIMESTAMP(tpep_pickup_datetime)) / 3600) AS longest_trip_hours
    FROM trip_data
    """
).collect()[0][0]

print(f"Longest trip in hours: {longest_trip_hours}")

Longest trip in hours: 162.61777777777777


## Question 5: User Interface

Spark’s User Interface which shows the application's dashboard runs on which local port?

- 80
- 443
- 4040
- 8080

In [28]:
# --- Question 5: User Interface ---
ngrok_tunnel = ngrok.connect(4040)
print("Spark UI Public URL:", ngrok_tunnel.public_url)

Spark UI Public URL: https://5e02-34-125-55-49.ngrok-free.app


In [30]:
spark_ui_port = spark.sparkContext.uiWebUrl.split(":")[-1]
print(f"Spark UI Port: {spark_ui_port}")

Spark UI Port: 4040


![Image](img/SparkUI_GoogleColab.png)

## Question 6: Least frequent pickup location zone

Load the zone lookup data into a temp view in Spark:

```bash
wget https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv
```

Using the zone lookup data and the Yellow October 2024 data, what is the name of the LEAST frequent pickup location Zone?

- Governor's Island/Ellis Island/Liberty Island
- Arden Heights
- Rikers Island
- Jamaica Bay

In [32]:
# --- Question 6: Least frequent pickup location zone ---
# Using SQL
# Load zone lookup data into a temporary view
zone_df = spark.read.csv("taxi_zone_lookup.csv", header=True)
zone_df.createOrReplaceTempView("zone_lookup")

# Use Spark SQL to find the least frequent pickup zone
least_frequent_zone = spark.sql(
    """
    SELECT Zone, COUNT(*) AS pickup_count
    FROM trip_data
    JOIN zone_lookup ON trip_data.PULocationID = zone_lookup.LocationID
    GROUP BY Zone
    ORDER BY pickup_count ASC
    LIMIT 1
    """
).collect()[0][0]  # Extract the Zone value

print(f"Least frequent pickup zone: {least_frequent_zone}")

Least frequent pickup zone: Governor's Island/Ellis Island/Liberty Island


In [31]:
# Using DataFrme
zone_df = spark.read.csv("taxi_zone_lookup.csv", header=True)
df_with_zones = df.join(zone_df, df.PULocationID == zone_df.LocationID, "left")

pickup_counts = df_with_zones.groupBy("Zone").count()
least_frequent_zone = pickup_counts.orderBy("count").first().Zone
print(f"Least frequent pickup zone: {least_frequent_zone}")

Least frequent pickup zone: Governor's Island/Ellis Island/Liberty Island


In [33]:
# Keep the tunnel alive (optional, but recommended)
# import time
# while True:
#     time.sleep(60) # keep the tunnel alive for 1 minute.

# Stop ngrok tunnel and SparkSession when finished (optional)
ngrok.disconnect(ngrok_tunnel.public_url)
spark.stop()





## Submitting the solutions

- Form for submitting: https://courses.datatalks.club/de-zoomcamp-2025/homework/hw5
- Deadline: See the website