In [1]:
from os import environ as env

from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import col, dayofmonth
from pyspark.sql.types import *

## Spark Setup

In [2]:
# Fetch the GCS Hadoop connector jar that `spark.jars` points at.
# `-nc` (no-clobber) skips the download when the jar is already present, so
# re-running this cell neither re-downloads it nor leaves `.jar.1` copies.
!mkdir -p /tmp/spark_jars/
!wget -nc -q https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-latest-hadoop2.jar -P /tmp/spark_jars

In [3]:
# Spark settings for a local session; `spark.jars` puts the GCS connector
# jar from /tmp/spark_jars on the classpath.
conf = (
    SparkConf()
    .setMaster('local[*]')
    .setAppName("pyspark-playground")
    .set("spark.cores.max", 4)
    .set("spark.driver.memory", "2g")
    .set("spark.executor.memory", "8g")
    .set("spark.sql.execution.arrow.pyspark.enabled", "true")
    .set("spark.jars", "/tmp/spark_jars/gcs-connector-latest-hadoop2.jar")
)

In [4]:
# Start the SparkContext from `conf` and reduce its log output to errors.
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")

# Register the `gs://` filesystem implementations and turn on
# service-account authentication in the context's Hadoop configuration.
hadoop_conf = sc._jsc.hadoopConfiguration()
for prop_name, prop_value in (
    ("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
    ("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
    ("fs.gs.auth.service.account.enable", "true"),
):
    hadoop_conf.set(prop_name, prop_value)

# The key file path is read from the environment; an unset variable raises KeyError.
hadoop_conf.set("fs.gs.auth.service.account.json.keyfile", env["GOOGLE_APPLICATION_CREDENTIALS"])

23/11/26 23:24:20 WARN Utils: Your hostname, magi.local resolves to a loopback address: 127.0.0.1; using 192.168.15.29 instead (on interface en0)
23/11/26 23:24:20 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
23/11/26 23:24:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [5]:
# Get or create the SparkSession using the same configuration object.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)

## Load Datasets from GCS

### FHV HV Dataset

In [6]:
# Load every Parquet file under the raw FHVHV prefix into one DataFrame.
fhvhv = spark.read.parquet(
    "gs://iobruno-lakehouse-raw/nyc_trip_record_data/fhvhv/*.parquet"
)

                                                                                

In [7]:
# Keep only the columns used below, rename them to snake_case and cast the
# two datetime columns to timestamps.
# Note: this rebinds `fhvhv`, so the cell cannot be re-run on its own: the
# source names (e.g. `PULocationID`) no longer exist afterwards. Re-run the
# Parquet load first.
fhvhv_columns = [
    col('dispatching_base_num'),
    col('Affiliated_base_number').alias('affiliated_base_num'),
    col('PULocationID').alias('pickup_location_id'),
    col('pickup_datetime').cast('timestamp'),
    col('DOLocationID').alias('dropoff_location_id'),
    col('dropoff_datetime').cast('timestamp'),
    col('SR_Flag').alias('sr_flag'),
]
fhvhv = fhvhv.select(*fhvhv_columns)

In [8]:
# Register the DataFrame as the temp view `fhvhv` so the SQL queries can select from it.
fhvhv.createOrReplaceTempView('fhvhv')

### Zone Lookup Dataset

In [9]:
# Explicit schema for the zone lookup CSV; every field is nullable.
zones_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("LocationID", IntegerType()),
        ("Borough", StringType()),
        ("Zone", StringType()),
        ("service_zone", StringType()),
    )
])

In [10]:
# Read the gzipped zone lookup CSVs with the explicit schema; the first line
# of each file is treated as a header.
zones = (
    spark.read
    .schema(zones_schema)
    .option("header", True)
    .csv("gs://iobruno-lakehouse-raw/nyc_trip_record_data/zone_lookup/*.csv.gz")
)

In [11]:
# Rename the lookup columns to snake_case (`service_zone` is kept as is).
# Note: this rebinds `zones`, so the cell cannot be re-run on its own:
# `LocationID` no longer exists afterwards. Re-run the CSV load first.
zone_renames = (
    ('LocationID', 'location_id'),
    ('Borough', 'borough'),
    ('Zone', 'zone'),
)
zones = zones.select(
    *[col(source).alias(target) for source, target in zone_renames],
    col('service_zone'),
)

In [12]:
# Register the lookup as the temp view `zones` so SQL queries can join against it.
zones.createOrReplaceTempView('zones')

## Homework

### Question 1

**Install Spark and PySpark** 

- Install Spark
- Run PySpark
- Create a local spark session
- Execute spark.version.

What's the output?

In [13]:
# Version string of the running Spark session (the answer to Question 1).
spark.version

'3.5.0'

### Question 2

**FHVHV June 2021**

Read it with Spark using the same schema as we did in the lessons. We will use this dataset for all the remaining questions.  
Repartition it to 12 partitions and save it to Parquet. What is the average size (in MB) of the Parquet files (those ending with the `.parquet` extension) that were created? Select the answer that most closely matches.

In [14]:
# Full contents of the `fhvhv` temp view, as a DataFrame.
df = spark.sql("SELECT * FROM fhvhv")

In [15]:
# Redistribute the data into 12 partitions and write it as snappy-compressed
# Parquet, replacing any previous output in the target directory.
(
    df.repartition(12)
    .write
    .mode("overwrite")
    .option("compression", "snappy")
    .parquet("/tmp/dtc/fhvhv-week5")
)

                                                                                

In [16]:
# List the written files with human-readable sizes to read off the typical Parquet part size.
!ls -lh /tmp/dtc/fhvhv-week5/

total 563408
-rw-r--r--  1 iobruno  wheel     0B Nov 26 23:24 _SUCCESS
-rw-r--r--  1 iobruno  wheel    22M Nov 26 23:24 part-00000-53cf477f-3e02-4dec-8ad5-ead5080b28b0-c000.snappy.parquet
-rw-r--r--  1 iobruno  wheel    22M Nov 26 23:24 part-00001-53cf477f-3e02-4dec-8ad5-ead5080b28b0-c000.snappy.parquet
-rw-r--r--  1 iobruno  wheel    22M Nov 26 23:24 part-00002-53cf477f-3e02-4dec-8ad5-ead5080b28b0-c000.snappy.parquet
-rw-r--r--  1 iobruno  wheel    22M Nov 26 23:24 part-00003-53cf477f-3e02-4dec-8ad5-ead5080b28b0-c000.snappy.parquet
-rw-r--r--  1 iobruno  wheel    22M Nov 26 23:24 part-00004-53cf477f-3e02-4dec-8ad5-ead5080b28b0-c000.snappy.parquet
-rw-r--r--  1 iobruno  wheel    22M Nov 26 23:24 part-00005-53cf477f-3e02-4dec-8ad5-ead5080b28b0-c000.snappy.parquet
-rw-r--r--  1 iobruno  wheel    22M Nov 26 23:24 part-00006-53cf477f-3e02-4dec-8ad5-ead5080b28b0-c000.snappy.parquet
-rw-r--r--  1 iobruno  wheel    22M Nov 26 23:24 part-00007-53cf477f-3e02-4dec-8ad5-ead5080b28b0-c000.snappy.p

### Question 3

**Count records**  

How many taxi trips were there on June 15? Consider only trips that started on June 15.

In [17]:
# Count the trips whose pickup fell on June 15, 2021.
# The year is pinned as well as the month and day: the view is loaded from a
# `*.parquet` glob, and a month/day-only filter would also count June 15 of
# any other year present in the data.
spark.sql("""
    WITH trips_per_month_day AS (
        SELECT
            month(pickup_datetime) as month,
            dayofmonth(pickup_datetime) as day,
            count(1) as num_trips
        FROM fhvhv
        WHERE year(pickup_datetime) = 2021
        GROUP BY month(pickup_datetime), dayofmonth(pickup_datetime)
    )

    SELECT * FROM trips_per_month_day
    WHERE month = 6 AND day = 15
""").take(1)

                                                                                

[Row(month=6, day=15, num_trips=452470)]

### Question 4

**Longest trip for each day**  

Now calculate the duration of each trip. How long was the longest trip, in hours?

In [18]:
# Longest trip, in hours.
# An explicit ORDER BY ... LIMIT 1 guarantees which row comes back, rather
# than relying on take(1) returning the rank-1 row of an unordered result.
# It also avoids ranking every trip inside one unpartitioned window.
# `rnk` is kept as a constant so the result has the same columns as before;
# the first row of a descending sort is rank 1 by construction.
spark.sql("""
    WITH trip_records AS (
        SELECT
            (CAST(dropoff_datetime as LONG) - CAST(pickup_datetime as LONG)) as duration_secs
        FROM
            fhvhv
    )

    SELECT
        duration_secs/3600 as duration_hours,
        1 as rnk
    FROM trip_records
    ORDER BY duration_secs DESC
    LIMIT 1
""").take(1)

                                                                                

[Row(duration_hours=66.8788888888889, rnk=1)]

### Question 6

**Most frequent pickup location zone**

Load the zone lookup data into a temp view in Spark [Zone Data](https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv).  
Using the zone lookup data and the fhvhv June 2021 data, what is the name of the most frequent pickup location zone?

In [19]:
# Count trips per pickup location, rank the locations by that count, and join
# the top-ranked location id to the zone lookup to get its zone name.
top_pickup_zone_sql = """
    WITH trips_per_location AS (
        SELECT
            f.pickup_location_id,
            count(1) as num_trips,
            dense_rank() OVER (ORDER BY count(1) DESC) as rnk
        FROM
            fhvhv f
        GROUP BY
            f.pickup_location_id
    )

    SELECT
        t.pickup_location_id,
        pu.zone,
        t.num_trips,
        t.rnk
    FROM
        trips_per_location t
    INNER JOIN zones pu
        ON t.pickup_location_id = pu.location_id
    WHERE
        t.rnk = 1
"""
spark.sql(top_pickup_zone_sql).take(1)

                                                                                

[Row(pickup_location_id=61, zone='Crown Heights North', num_trips=231279, rnk=1)]