In [1]:
from os import environ as env

from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import col, dayofmonth
from pyspark.sql.types import *

## Spark Setup

In [2]:
# Download the GCS Hadoop connector jar into /tmp/spark_jars.
# -nc skips the download when the file is already there, -q silences wget's output.
!wget -nc https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-latest-hadoop2.jar -P /tmp/spark_jars -q

In [3]:
# Spark configuration for a local, in-process playground session.
conf = SparkConf()
conf.setMaster('local[*]')
conf.setAppName("pyspark-playground")

# Resource sizing.
# NOTE(review): with a local[*] master the executor/cores.max settings presumably
# have no effect (everything runs inside the driver JVM) - confirm before tuning.
conf.set("spark.cores.max", 4)
conf.set("spark.driver.memory", "2g")
conf.set("spark.executor.memory", "8g")

# Arrow-backed conversions between Spark and pandas.
conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

# Make the downloaded GCS connector jar available to the session.
conf.set("spark.jars", "/tmp/spark_jars/gcs-connector-latest-hadoop2.jar")

In [4]:
# Resolve the service-account key path first: if GOOGLE_APPLICATION_CREDENTIALS
# is not set, this raises KeyError before any SparkContext has been started.
gcs_keyfile = env["GOOGLE_APPLICATION_CREDENTIALS"]

# getOrCreate() returns the already-running context when there is one, so this
# cell can be re-run without "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)
sc.setLogLevel("ERROR")

# Register the GCS connector implementations for the gs:// scheme and
# authenticate with the service-account JSON keyfile.
hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.AbstractFileSystem.gs.impl",  "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
hadoop_conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
hadoop_conf.set("fs.gs.auth.service.account.enable", "true")
hadoop_conf.set("fs.gs.auth.service.account.json.keyfile", gcs_keyfile)

24/03/01 13:01:28 WARN Utils: Your hostname, magi.local resolves to a loopback address: 127.0.0.1; using 192.168.15.29 instead (on interface en0)
24/03/01 13:01:28 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/03/01 13:01:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [5]:
# Build the SparkSession from the same configuration (or reuse the active one).
spark = (
    SparkSession.builder
        .config(conf=conf)
        .getOrCreate()
)

## Load Datasets from GCS

### FHV Dataset

In [6]:
# Raw FHV trip records for October 2019, read straight from the GCS bucket.
raw_fhv = spark.read.parquet(
    "gs://iobruno-lakehouse-raw/nyc_tlc_trip_record_data/fhv/fhv_tripdata_2019-10.snappy.parquet"
)

                                                                                

In [7]:
# Project the raw FHV columns onto a consistent snake_case schema.
# The two datetime columns are cast to timestamp and aliased explicitly: a bare
# cast keeps the source column's name, which left `dropOff_datetime` as the only
# mixed-case column in the result.
fhv = raw_fhv.select(
    col('dispatching_base_num'),
    col('Affiliated_base_number').alias('affiliated_base_num'),
    col('PUlocationID').alias('pickup_location_id'),
    col('DOlocationID').alias('dropoff_location_id'),
    col('pickup_datetime').cast('timestamp').alias('pickup_datetime'),
    col('dropOff_datetime').cast('timestamp').alias('dropoff_datetime'),
    col('SR_Flag').alias('sr_flag'),
)

In [8]:
# Expose the cleaned FHV frame to Spark SQL under the name `fhv`.
fhv.createOrReplaceTempView(name='fhv')

### Zone Lookup Dataset

In [9]:
# Explicit schema for the zone lookup CSV: (column name, type) pairs, all nullable.
zones_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("LocationID", IntegerType()),
        ("Borough", StringType()),
        ("Zone", StringType()),
        ("service_zone", StringType()),
    )
])

In [10]:
# Read every gzipped zone lookup CSV in the bucket, using the header row
# and the explicit schema rather than schema inference.
zones = spark.read.csv(
    "gs://iobruno-lakehouse-raw/nyc_tlc_trip_record_data/zone_lookup/*.csv.gz",
    schema=zones_schema,
    header=True,
)

In [11]:
# Rename the lookup columns to snake_case; `service_zone` already conforms.
zones = zones.select(
    *(
        col(source_name).alias(target_name)
        for source_name, target_name in (
            ('LocationID', 'location_id'),
            ('Borough', 'borough'),
            ('Zone', 'zone'),
        )
    ),
    col('service_zone'),
)

In [12]:
# Expose the zone lookup frame to Spark SQL under the name `zones`.
zones.createOrReplaceTempView(name='zones')

## Homework

### Question 1

**Install Spark and PySpark** 

- Install Spark
- Run PySpark
- Create a local spark session
- Execute spark.version.

What's the output?

In [13]:
# Answer to Question 1: the version string of the Spark runtime behind this session.
spark.version

'3.5.0'

### Question 2

**FHV October 2019**  
- Read the October 2019 FHV data into a Spark DataFrame with a schema, as we did in the lessons
- Repartition the DataFrame into 6 partitions and save it as Parquet.

What is the average size of the Parquet (ending with .parquet extension) Files that were created (in MB)?  
Select the answer which most closely matches
- [ ] 1 MB
- [x] 6 MB
- [ ] 25 MB
- [ ] 87 MB

In [14]:
# All rows and columns of the `fhv` temp view, as a DataFrame.
df = spark.sql(
    """SELECT * FROM fhv"""
)

In [15]:
# Shuffle into 6 partitions and write them as Parquet,
# replacing whatever a previous run left in the target directory.
(
    df.repartition(6)
        .write
        .mode("overwrite")
        .parquet("/tmp/dtc-homework/fhv/")
)

                                                                                

In [16]:
# List the written files with human-readable sizes to answer Question 2.
!ls -lh /tmp/dtc-homework/fhv/

total 76768
-rw-r--r--@ 1 iobruno  wheel     0B Mar  1 13:01 _SUCCESS
-rw-r--r--@ 1 iobruno  wheel   6.3M Mar  1 13:01 part-00000-83f7a6b5-f525-418c-8ecb-b125533cee70-c000.snappy.parquet
-rw-r--r--@ 1 iobruno  wheel   6.2M Mar  1 13:01 part-00001-83f7a6b5-f525-418c-8ecb-b125533cee70-c000.snappy.parquet
-rw-r--r--@ 1 iobruno  wheel   6.2M Mar  1 13:01 part-00002-83f7a6b5-f525-418c-8ecb-b125533cee70-c000.snappy.parquet
-rw-r--r--@ 1 iobruno  wheel   6.2M Mar  1 13:01 part-00003-83f7a6b5-f525-418c-8ecb-b125533cee70-c000.snappy.parquet
-rw-r--r--@ 1 iobruno  wheel   6.2M Mar  1 13:01 part-00004-83f7a6b5-f525-418c-8ecb-b125533cee70-c000.snappy.parquet
-rw-r--r--@ 1 iobruno  wheel   6.2M Mar  1 13:01 part-00005-83f7a6b5-f525-418c-8ecb-b125533cee70-c000.snappy.parquet


### Question 3

**Count records**  

How many taxi trips were there on the 15th of October?  

Consider only trips that started on the 15th of October
- [ ] 108,164
- [ ] 12,856
- [ ] 452,470
- [x] 62,610

In [17]:
# Question 3: number of trips whose pickup falls on 2019-10-15.
# The filter is on the full calendar date (year included) and is applied before
# the aggregation, so records carrying an Oct 15 pickup from any other year are
# not counted and only the rows of interest are grouped.
# The output columns (month, day, num_trips) are unchanged.
spark.sql("""
    select
        month(pickup_datetime) as month,
        dayofmonth(pickup_datetime) as day,
        count(1) as num_trips
    from
        fhv
    where
        to_date(pickup_datetime) = date '2019-10-15'
    group by
        month(pickup_datetime),
        dayofmonth(pickup_datetime)
""").take(1)

                                                                                

[Row(month=10, day=15, num_trips=62610)]

### Question 4: 

**Longest trip for each day**  

What is the length of the longest trip in the dataset in hours?
- [x] 631,152.50 Hours
- [ ] 243.44 Hours
- [ ] 7.68 Hours
- [ ] 3.32 Hours

In [18]:
# Question 4: duration, in hours, of the longest trip in the dataset.
# Duration is the difference of the two timestamps in epoch seconds, divided by 3600.
# The result is picked with an explicit ORDER BY ... LIMIT 1 rather than relying on
# the row order left behind by an un-partitioned window, which take(1) is not
# guaranteed to respect and which ranks every row just to find the maximum.
# NULL durations are sorted last so they can never be returned as the "longest".
# `rnk` is kept as a constant so the output columns stay (duration_in_hours, rnk).
spark.sql("""
    with trip_records AS (
        select
            (cast(dropoff_datetime as long) - cast(pickup_datetime as long))/3600 as duration_in_hours
        from
            fhv
    )

    select
        duration_in_hours,
        1 as rnk
    from
        trip_records
    order by
        duration_in_hours desc nulls last
    limit 1
""").take(1)

                                                                                

[Row(duration_in_hours=631152.5, rnk=1)]

### Question 5: 

**User Interface**

On which local port does Spark's User Interface, which shows the application's dashboard, run?

- [ ] 80
- [ ] 443
- [x] 4040
- [ ] 8080

### Question 6: 

**Least frequent pickup location zone**

Load the zone lookup data into a temp view in Spark [Zone Data](https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv)  

Using the zone lookup data and the FHV October 2019 data, what is the name of the LEAST frequent pickup location Zone?
- [ ] East Chelsea
- [x] Jamaica Bay
- [ ] Union Sq
- [ ] Crown Heights North

In [19]:
# Question 6: name of the least frequent pickup zone.
# Step 1 (CTE): count trips per pickup_location_id and dense-rank the counts
#   ascending, so the smallest count gets rnk = 1.
# Step 2: inner-join the rnk = 1 rows to the `zones` view to resolve the zone name;
#   pickup ids without a match in `zones` are dropped by the inner join.
# NOTE(review): if several locations tie for the smallest count, all of them have
#   rnk = 1 and take(1) returns whichever comes first - confirm this is acceptable.
spark.sql("""
    with trips_per_location AS (
        select
            pickup_location_id,
            count(1) as num_trips,
            dense_rank() over (order by count(1) asc) as rnk
        from
            fhv
        group by
            pickup_location_id
    )

    select
        pu.zone,
        t.num_trips,
        t.rnk
    from
        trips_per_location t
    inner join
        zones pu on t.pickup_location_id = pu.location_id
    where
        rnk = 1
""").take(1)

                                                                                

[Row(zone='Jamaica Bay', num_trips=1, rnk=1)]